diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7a3345a808..aa3d8ce8c6 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -115,6 +115,8 @@ set( MIOpen_Source execution_context.cpp reducetensor.cpp reducetensor_api.cpp + activ/problem_description.cpp + solver/activ/fwd_0.cpp include/miopen/buffer_info.hpp include/miopen/temp_file.hpp include/miopen/bfloat16.hpp diff --git a/src/activ/problem_description.cpp b/src/activ/problem_description.cpp new file mode 100644 index 0000000000..4a959032a9 --- /dev/null +++ b/src/activ/problem_description.cpp @@ -0,0 +1,75 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include <miopen/activ/problem_description.hpp> +#include <miopen/names.hpp> + +#include <sstream> + +namespace miopen { + +namespace activ { + +NetworkConfig ProblemDescription::MakeNetworkConfig() const +{ + // short cut for packed tensors and 2D tensors with stride != width + const auto& x_lens = xDesc.GetLengths(); + + const auto x_elem_sz = xDesc.GetElementSize(); + + const auto x_width2D = + ((x_lens.size() == 2) ? x_lens[1] : (x_lens.size() == 3) ? x_lens[2] : (x_lens.size() == 4) + ? x_lens[3] + : x_lens[4]); + + const auto height = + (x_lens.size() == 2) ? x_lens[0] : (x_lens.size() == 3) ? x_lens[1] : (x_lens.size() == 4) + ? x_lens[2] + : x_lens[3]; + + const auto packed = xDesc.IsPacked() && yDesc.IsPacked(); + + const auto read_len = (packed) ? x_elem_sz : x_width2D; + + const auto read_unit = (read_len % 4 == 0) ? 4 : (read_len % 2 == 0) ? 2 : 1; + const auto MAP_RD = read_len / read_unit; + + std::ostringstream ss; + + ss << "activ-"; + ss << ((packed) ? "11" : "10"); // + lite bit + ss << xDesc.GetType(); + ss << activDesc.GetMode(); + ss << read_unit; + ss << MAP_RD; + ss << height; + + return NetworkConfig{ss.str()}; +} + +} // namespace activ + +} // namespace miopen diff --git a/src/binary_cache.cpp b/src/binary_cache.cpp index 03abeff9cb..05204bf4c2 100644 --- a/src/binary_cache.cpp +++ b/src/binary_cache.cpp @@ -60,17 +60,15 @@ static boost::filesystem::path ComputeSysCachePath() static boost::filesystem::path ComputeUserCachePath() { #ifdef MIOPEN_CACHE_DIR - std::string cache_dir = MIOPEN_CACHE_DIR; - std::string version; - - version = std::to_string(MIOPEN_VERSION_MAJOR) + "." + std::to_string(MIOPEN_VERSION_MINOR) + - "." + std::to_string(MIOPEN_VERSION_PATCH) + "." + - MIOPEN_STRINGIZE(MIOPEN_VERSION_TWEAK); - auto p = boost::filesystem::path{miopen::ExpandUser(cache_dir)} / version; + const std::string cache_dir = MIOPEN_CACHE_DIR; + const std::string version = + std::to_string(MIOPEN_VERSION_MAJOR) + "." 
+ std::to_string(MIOPEN_VERSION_MINOR) + "." + + std::to_string(MIOPEN_VERSION_PATCH) + "." + MIOPEN_STRINGIZE(MIOPEN_VERSION_TWEAK); const char* const custom = miopen::GetStringEnv(MIOPEN_CUSTOM_CACHE_DIR{}); - if(custom != nullptr && strlen(custom) > 0) - p = boost::filesystem::path{miopen::ExpandUser(custom)}; + const auto p = (custom != nullptr && strlen(custom) > 0) + ? boost::filesystem::path{miopen::ExpandUser(custom)} + : boost::filesystem::path{miopen::ExpandUser(cache_dir)} / version; if(!boost::filesystem::exists(p) && !MIOPEN_DISABLE_USERDB) boost::filesystem::create_directories(p); diff --git a/src/expanduser.cpp b/src/expanduser.cpp index 96732c3a13..1a0820fabe 100644 --- a/src/expanduser.cpp +++ b/src/expanduser.cpp @@ -7,7 +7,7 @@ MIOPEN_DECLARE_ENV_VAR(HOME) namespace miopen { -std::string ExpandUser(std::string p) +std::string ExpandUser(const std::string& p) { const char* home_dir = GetStringEnv(HOME{}); if(home_dir == nullptr || home_dir == std::string("/") || home_dir == std::string("")) diff --git a/src/hip/handlehip.cpp b/src/hip/handlehip.cpp index 5c2a05a555..f042336591 100644 --- a/src/hip/handlehip.cpp +++ b/src/hip/handlehip.cpp @@ -503,12 +503,11 @@ std::size_t Handle::GetGlobalMemorySize() const std::size_t Handle::GetMaxComputeUnits() const { + const std::size_t num_cu = Value(MIOPEN_DEVICE_CU{}); + if(num_cu > 0) + return num_cu; + int result; - const char* const num_cu = miopen::GetStringEnv(MIOPEN_DEVICE_CU{}); - if(num_cu != nullptr && strlen(num_cu) > 0) - { - return boost::lexical_cast(num_cu); - } auto status = hipDeviceGetAttribute(&result, hipDeviceAttributeMultiprocessorCount, this->impl->device); if(status != hipSuccess) diff --git a/src/hipoc/hipoc_program.cpp b/src/hipoc/hipoc_program.cpp index 91cf2ae20a..ec8a3f15e8 100644 --- a/src/hipoc/hipoc_program.cpp +++ b/src/hipoc/hipoc_program.cpp @@ -58,8 +58,6 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_OPENCL_ENFORCE_CODE_OBJECT_OPTION) 
MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_OPENCL_ENFORCE_CODE_OBJECT_VERSION) MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEVICE_ARCH) -#define MIOPEN_WORKAROUND_SWDEV_225285 1 - #if MIOPEN_USE_COMGR #define MIOPEN_WORKAROUND_ROCM_COMPILER_SUPPORT_ISSUE_27 1 #endif @@ -164,19 +162,12 @@ static hipModulePtr CreateModule(const boost::filesystem::path& hsaco_file) template /// intended for std::string and std::vector hipModulePtr CreateModuleInMem(const T& blob) { -#if !MIOPEN_WORKAROUND_SWDEV_225285 hipModule_t raw_m; auto status = hipModuleLoadData(&raw_m, reinterpret_cast(blob.data())); hipModulePtr m{raw_m}; if(status != hipSuccess) MIOPEN_THROW_HIP_STATUS(status, "Failed loading module"); return m; -#else - TmpDir tmp_dir("miopen"); - auto file_path = tmp_dir.path / boost::filesystem::unique_path("miopen-%%%%-%%%%-%%%%-%%%%"); - WriteFile(blob, file_path); - return CreateModule(file_path); -#endif } HIPOCProgramImpl::HIPOCProgramImpl(const std::string& program_name, @@ -187,16 +178,12 @@ HIPOCProgramImpl::HIPOCProgramImpl(const std::string& program_name, } HIPOCProgramImpl::HIPOCProgramImpl(const std::string& program_name, const std::string& blob) - : program(program_name) + : program(program_name) ///, module(CreateModuleInMem(blob)) { - TmpDir tmp_dir("miopen"); - auto file_path = tmp_dir.path / boost::filesystem::unique_path("miopen-%%%%-%%%%-%%%%-%%%%"); - WriteFile(blob, file_path); - const char* const arch = miopen::GetStringEnv(MIOPEN_DEVICE_ARCH{}); - if(arch == nullptr) - { - this->module = CreateModule(file_path); - } + if(nullptr != + miopen::GetStringEnv(MIOPEN_DEVICE_ARCH{})) /// \todo Finish off this spaghetti eventually. 
+ return; + module = CreateModuleInMem(blob); } HIPOCProgramImpl::HIPOCProgramImpl(const std::string& program_name, diff --git a/src/include/miopen/activ/invoke_params.hpp b/src/include/miopen/activ/invoke_params.hpp new file mode 100644 index 0000000000..139bb3a8d2 --- /dev/null +++ b/src/include/miopen/activ/invoke_params.hpp @@ -0,0 +1,52 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#pragma once + +#include +#include + +namespace miopen { +namespace activ { + +struct InvokeParams : public miopen::InvokeParams +{ + InvokeParams() = default; + + double alpha = 0; + TensorDescriptor x_desc; + ConstData_t x = nullptr; + double beta = 0; + TensorDescriptor y_desc; + Data_t y = nullptr; + double gamma = 0; + size_t x_offset = 0; + size_t y_offset = 0; +}; + +} // namespace activ + +} // namespace miopen diff --git a/src/include/miopen/activ/problem_description.hpp b/src/include/miopen/activ/problem_description.hpp new file mode 100644 index 0000000000..de166ca0aa --- /dev/null +++ b/src/include/miopen/activ/problem_description.hpp @@ -0,0 +1,80 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include +#include + +#include + +namespace miopen { + +struct NetworkConfig; + +namespace activ { + +enum class Direction +{ + Forward, + Backward, +}; + +struct ProblemDescription +{ + ProblemDescription(Direction direction_, + const ActivationDescriptor& activ, + const TensorDescriptor& xDesc_, + const TensorDescriptor& yDesc_) + : direction(direction_), activDesc(activ), xDesc(xDesc_), yDesc(yDesc_) + { + } + + Direction GetDirection() const { return direction; } + const ActivationDescriptor& GetActivDesc() const { return activDesc; } + const TensorDescriptor& GetXDesc() const { return xDesc; } + const TensorDescriptor& GetYDesc() const { return yDesc; } + + NetworkConfig MakeNetworkConfig() const; + + void Serialize(std::ostream& stream) const; + + friend std::ostream& operator<<(std::ostream& os, const ProblemDescription& obj) + { + obj.Serialize(os); + return os; + } + + private: + Direction direction; + ActivationDescriptor activDesc; + TensorDescriptor xDesc; + TensorDescriptor yDesc; +}; + +} // namespace activ + +} // namespace miopen diff --git a/src/include/miopen/activ/solvers.hpp b/src/include/miopen/activ/solvers.hpp new file mode 100644 index 0000000000..cc7082a631 --- /dev/null +++ b/src/include/miopen/activ/solvers.hpp @@ -0,0 +1,53 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#pragma once + +#include <miopen/solver.hpp> + +namespace miopen { + +namespace activ { +struct ProblemDescription; +} // namespace activ + +namespace solver { + +namespace activ { + +struct ActivFwdSolver0 : public SolverBase +{ + bool IsApplicable(const ExecutionContext& context, + const miopen::activ::ProblemDescription& problem) const; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::activ::ProblemDescription& problem) const; +}; + +} // namespace activ + +} // namespace solver + +} // namespace miopen diff --git a/src/include/miopen/execution_context.hpp b/src/include/miopen/execution_context.hpp index 85e610e0f9..59dd4f9fc0 100644 --- a/src/include/miopen/execution_context.hpp +++ b/src/include/miopen/execution_context.hpp @@ -84,6 +84,7 @@ struct ExecutionContext inline void SetStream(Handle* stream_) { stream = stream_; } ExecutionContext() = default; + ExecutionContext(Handle* stream_) : stream(stream_) {} void DetectRocm(); diff --git a/src/include/miopen/expanduser.hpp b/src/include/miopen/expanduser.hpp index f97d7be796..b6e49b3c50 100644 --- a/src/include/miopen/expanduser.hpp +++ b/src/include/miopen/expanduser.hpp @@ -30,7 +30,7 @@ namespace miopen { -std::string ExpandUser(std::string p); +std::string ExpandUser(const std::string& p); } // namespace miopen diff --git a/src/include/miopen/find_solution.hpp b/src/include/miopen/find_solution.hpp index 99534f95e7..c7908ca63f 100644 --- a/src/include/miopen/find_solution.hpp +++ b/src/include/miopen/find_solution.hpp @@ -182,6 +182,55 @@ struct SolverContainer + // Search for all applicable solutions among many solvers + template <class Problem> + std::vector<ConvSolution> + SearchForSolutions(const ExecutionContext& ctx, + const Problem& problem, + std::size_t limit = std::numeric_limits<std::size_t>::max()) const + { + std::vector<ConvSolution> ss; + std::size_t count = 0; + const auto find_only = GetEnvFindOnlySolver(); + miopen::each_args( + 
[&](auto solver) { + if(count >= limit) + return; + if(find_only.IsValid() && find_only != Id{SolverDbId(solver)}) + { // Do nothing (and keep silence for the sake of Tuna), just skip. + } + // For better performance, check IsDynamic() first, because + // it is much faster than IsApplicable(). + // else if(problem.use_dynamic_solutions_only && !solver.IsDynamic()) + // MIOPEN_LOG_I2(SolverDbId(solver) << ": Skipped (non-dynamic)"); + else if(!solver.IsApplicable(ctx, problem)) + MIOPEN_LOG_I2(SolverDbId(solver) << ": Not applicable"); + else + { + auto s = solver.GetSolution(ctx, problem); + s.solver_id = SolverDbId(solver); + if(s.Succeeded()) + { + ++count; + ss.push_back(s); + MIOPEN_LOG_I2(SolverDbId(solver) << ": Success."); + } + else + { + /// \todo If Solver is applicable it must provide an appropriate Solution. + /// This is not the case for some 20x5 convolutions (and possibly others). + /// Normally we should not get here and message level should be Error. + /// For now, let's use Info (not Warning) level to avoid + /// flooding the console. 
+ MIOPEN_LOG_I(SolverDbId(solver) + << ": [Warning] Applicable Solver not succeeded."); + } + } + }, + Solvers{}...); + return ss; + } + template std::vector> GetWorkspaceSize(const Context& search_params, diff --git a/src/include/miopen/handle.hpp b/src/include/miopen/handle.hpp index 51f947be0b..58bd3eaf6d 100644 --- a/src/include/miopen/handle.hpp +++ b/src/include/miopen/handle.hpp @@ -224,11 +224,11 @@ struct Handle : miopenHandle void RegisterInvoker(const Invoker& invoker, const NetworkConfig& config, - solver::Id solver, + const std::string& solver, const AlgorithmName& algo) { - invokers.Register({config, solver.ToString()}, invoker); - invokers.SetAsFound1_0(config, algo, solver.ToString()); + invokers.Register({config, solver}, invoker); + invokers.SetAsFound1_0(config, algo, solver); } boost::optional diff --git a/src/include/miopen/stringutils.hpp b/src/include/miopen/stringutils.hpp index 3521fff99a..548aeb311f 100644 --- a/src/include/miopen/stringutils.hpp +++ b/src/include/miopen/stringutils.hpp @@ -41,9 +41,10 @@ namespace miopen { inline std::string -ReplaceString(std::string subject, const std::string& search, const std::string& replace) +ReplaceString(const std::string& in, const std::string& search, const std::string& replace) { size_t pos = 0; + std::string subject(in); while((pos = subject.find(search, pos)) != std::string::npos) { subject.replace(pos, search.length(), replace); diff --git a/src/kernels/composable_kernel/include/kernel_algorithm/reduction_functions.hpp b/src/kernels/composable_kernel/include/kernel_algorithm/reduction_functions.hpp index 4ed6bb6fd2..8772270693 100644 --- a/src/kernels/composable_kernel/include/kernel_algorithm/reduction_functions.hpp +++ b/src/kernels/composable_kernel/include/kernel_algorithm/reduction_functions.hpp @@ -50,10 +50,12 @@ struct binop_with_nan_check(accuVal), currVal, changed); @@ -75,7 +77,7 @@ struct binop_with_nan_check // The method is called when the opReduce is indexable and the user 
asked for indices __device__ static inline void - calculate(compType& accuVal, compType currVal, int& accuIndex, int currIndex) + calculate(compType& accuVal, compType currVal, VOLATILE_WA_274384 int& accuIndex, int currIndex) { if(isnan(currVal)) { @@ -84,7 +86,7 @@ struct binop_with_nan_check } else { - bool changed = false; + VOLATILE_WA_274384 bool changed = false; opReduce{}(accuVal, currVal, changed); @@ -527,9 +529,9 @@ struct BlockwiseReduction_2d_block_buffer compType& accuData, int& accuIndex) { - const index_t thread_local_id = get_thread_local_1d_id(); - compType lAccuData = opReduce::GetZeroVal(); - int lAccuIndex = 0; + const index_t thread_local_id = get_thread_local_1d_id(); + compType lAccuData = opReduce::GetZeroVal(); + VOLATILE_WA_274384 int lAccuIndex = 0; static_if{}([&](auto) { for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++) diff --git a/src/kernels/composable_kernel/include/utility/reduction_common.hpp b/src/kernels/composable_kernel/include/utility/reduction_common.hpp index fc389efcb2..e1c5c2f2df 100644 --- a/src/kernels/composable_kernel/include/utility/reduction_common.hpp +++ b/src/kernels/composable_kernel/include/utility/reduction_common.hpp @@ -28,6 +28,14 @@ #include "float_type.hpp" +#define WORKAROUND_SWDEV_274384 (HIP_PACKAGE_VERSION_FLAT >= 4002021203ULL) + +#if WORKAROUND_SWDEV_274384 +#define VOLATILE_WA_274384 volatile +#else +#define VOLATILE_WA_274384 +#endif + // this enumerate should be synchronized with include/miopen/reduce_common.hpp namespace ck { enum class ReductionMethod_t diff --git a/src/kernels/composable_kernel/include/utility/reduction_operator.hpp b/src/kernels/composable_kernel/include/utility/reduction_operator.hpp index 3019cbd9e4..a9c041895d 100644 --- a/src/kernels/composable_kernel/include/utility/reduction_operator.hpp +++ b/src/kernels/composable_kernel/include/utility/reduction_operator.hpp @@ -93,15 +93,13 @@ struct Max a = b; } - __device__ inline constexpr void 
operator()(T& a, T b, bool& changed) const + __device__ inline constexpr void operator()(T& a, T b, VOLATILE_WA_274384 bool& changed) const { if(a < b) { a = b; changed = true; } - else - changed = false; } static constexpr bool indexable = true; @@ -120,15 +118,13 @@ struct Min a = b; } - __device__ inline constexpr void operator()(T& a, T b, bool& changed) const + __device__ inline constexpr void operator()(T& a, T b, VOLATILE_WA_274384 bool& changed) const { if(a > b) { a = b; changed = true; } - else - changed = false; } static constexpr bool indexable = true; diff --git a/src/ocl/activ_ocl.cpp b/src/ocl/activ_ocl.cpp index 08f70281e9..0d317a3f63 100644 --- a/src/ocl/activ_ocl.cpp +++ b/src/ocl/activ_ocl.cpp @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2017 Advanced Micro Devices, Inc. + * Copyright (c) 2021 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -28,6 +28,10 @@ #include #include #include +#include +#include +#include +#include namespace miopen { @@ -46,6 +50,49 @@ miopenStatus_t ActivationDescriptor::Forward(Handle& handle, { MIOPEN_THROW("Only alpha=1 and beta=0 is supported"); } + + const auto problem = activ::ProblemDescription{activ::Direction::Forward, *this, xDesc, yDesc}; + + const auto invoke_params = [&]() { + auto tmp = activ::InvokeParams{}; + tmp.type = InvokeType::Run; + tmp.alpha = GetAlpha(); + tmp.beta = GetBeta(); + tmp.gamma = GetGamma(); + tmp.x = x; + tmp.x_desc = xDesc; + tmp.y = y; + tmp.y_desc = yDesc; + tmp.x_offset = xOffset; + tmp.y_offset = yOffset; + return tmp; + }(); + + const auto algo = AlgorithmName{"miopenActivationForward"}; + const auto network_config = problem.MakeNetworkConfig(); + + if(const auto invoker = handle.GetInvoker(network_config, boost::none, algo)) + { + (*invoker)(handle, invoke_params); + return miopenStatusSuccess; + } + + const auto ctx = 
ExecutionContext{&handle}; + const auto solvers = solver::SolverContainer{}; + const auto slns = solvers.SearchForSolutions(ctx, problem, 1); + + if(!slns.empty()) + { + const auto& sln = slns.front(); + if(!sln.invoker_factory) + MIOPEN_THROW("Invoker missing in solver " + sln.solver_id); + const auto invoker = handle.PrepareInvoker(*sln.invoker_factory, sln.construction_params); + handle.RegisterInvoker(invoker, network_config, sln.solver_id, algo); + invoker(handle, invoke_params); + return miopenStatusSuccess; + } + + // legacy part start miopenStatus_t status = miopenStatusSuccess; mlo_construct_neuron construct_params(conv::Direction::Forward); @@ -53,8 +100,6 @@ miopenStatus_t ActivationDescriptor::Forward(Handle& handle, double activ_beta = GetBeta(); double activ_gamma = GetGamma(); - std::string network_config{}; - // short cut for packed tensors and 2D tensors with stride != width auto x_lens = xDesc.GetLengths(); auto y_lens = yDesc.GetLengths(); @@ -62,325 +107,159 @@ miopenStatus_t ActivationDescriptor::Forward(Handle& handle, auto x_strides = xDesc.GetStrides(); auto y_strides = yDesc.GetStrides(); - auto x_elem_sz = xDesc.GetElementSize(); - auto y_elem_sz = yDesc.GetElementSize(); - - auto x_stride2D = static_cast( - (x_lens.size() == 2) ? x_strides[0] : (x_lens.size() == 3) - ? x_strides[1] - : (x_lens.size() == 4) ? x_strides[2] - : x_strides[3]); - auto y_stride2D = static_cast( - (y_lens.size() == 2) ? y_strides[0] : (y_lens.size() == 3) - ? y_strides[1] - : (y_lens.size() == 4) ? y_strides[2] - : y_strides[3]); - - auto x_width2D = - ((x_lens.size() == 2) ? x_lens[1] : (x_lens.size() == 3) ? x_lens[2] : (x_lens.size() == 4) - ? x_lens[3] - : x_lens[4]); - - auto y_width2D = - ((y_lens.size() == 2) ? y_lens[1] : (y_lens.size() == 3) ? y_lens[2] : (y_lens.size() == 4) - ? 
y_lens[3] - : y_lens[4]); - - bool t2D = (x_lens.size() == y_lens.size() && - ((x_width2D != x_stride2D) || (y_width2D != y_stride2D)) && - (x_lens.size() == 2 || (x_lens.size() == 3 && x_lens[0] == 1 && y_lens[0] == 1) || - (x_lens.size() == 4 && x_lens[0] == 1 && x_lens[1] == 1 && y_lens[0] == 1 && - y_lens[1] == 1) || - (x_lens.size() == 5 && x_lens[0] == 1 && x_lens[1] == 1 && x_lens[2] == 1 && - y_lens[0] == 1 && y_lens[1] == 1 && y_lens[2] == 1))); - bool packed = xDesc.IsPacked() && yDesc.IsPacked(); - visit_float(xDesc.GetType(), [&](auto as_float) { - - if(x_elem_sz == y_elem_sz && (packed || t2D)) + construct_params.setStream(&handle); + + int nOut = 1; + int cOut = 1; + int hOut = 1; + int wOut = 1; + int nOutStride = 0; + int cOutStride = 0; + int hOutStride = 0; + int wOutStride = 0; + + if(yDesc.GetSize() == 4) { - std::string compiler_options; - auto f_activ_alpha = as_float(activ_alpha); - auto f_activ_beta = as_float(activ_beta); - auto f_activ_gamma = as_float(activ_gamma); - - size_t height = (x_lens.size() == 2) ? x_lens[0] : (x_lens.size() == 3) - ? x_lens[1] - : (x_lens.size() == 4) - ? x_lens[2] - : x_lens[3]; - - size_t read_len = (packed) ? x_elem_sz : x_width2D; - - size_t read_unit = (read_len % 4 == 0) ? 4 : (read_len % 2 == 0) ? 2 : 1; - size_t MAP_RD = read_len / read_unit; - - const std::string READ_TYPE = - (read_unit == 1) ? "_FLOAT" : "_FLOAT" + std::to_string(read_unit); - - network_config = ((packed) ? 
"11" : "10") // + lite bit - + std::to_string(xDesc.GetType()) + std::to_string(mode) + - std::to_string(read_unit) + std::to_string(MAP_RD) + - std::to_string(height); - - auto&& kernels = handle.GetKernels("miopenActivationForward", network_config); - if(!kernels.empty()) - { - auto kernel = kernels.front(); - if(packed) - { - kernel(x, - y, - f_activ_gamma, - f_activ_beta, - f_activ_alpha, - static_cast(xOffset), - static_cast(yOffset)); - } - else - { - kernel(x, - y, - f_activ_gamma, - f_activ_beta, - f_activ_alpha, - static_cast(xOffset), - static_cast(yOffset), - x_stride2D, - y_stride2D); - } - } - else + std::tie(nOut, cOut, hOut, wOut) = tien<4>(yDesc.GetLengths()); + std::tie(nOutStride, cOutStride, hOutStride, wOutStride) = tien<4>(yDesc.GetStrides()); + } + else if(yDesc.GetSize() < 4 && yDesc.GetSize() > 0) + { + auto tensor_size = yDesc.GetSize(); + switch(tensor_size) { - std::string type_opt; - if(xDesc.GetType() == miopenFloat) - { - type_opt = " -DMIOPEN_USE_FP16=0 -DMIOPEN_USE_FP32=1"; - } - else if(xDesc.GetType() == miopenHalf) - { - type_opt = " -DMIOPEN_USE_FP16=1 -DMIOPEN_USE_FP32=0"; - } - - compiler_options = " -DLITE -DMIOPEN_READ_UNIT=" + std::to_string(read_unit) + - " -DMIOPEN_READ_TYPE=" + READ_TYPE + " -DMIOPEN_NRN_OP_ID=" + - std::to_string(mode) + type_opt; - - std::vector vld; - std::vector vgd; - - vld.push_back(256); - vld.push_back(1); - vld.push_back(1); - - vgd.push_back(MAP_RD); - - std::string program_name = "MIOpenNeuron.cl"; - std::string kernel_name = - (packed) ? 
"MIOpenActiveFwdLite" : "MIOpenActiveFwd2DLite"; - if(packed) - { - vgd.push_back(1); - vgd.push_back(1); - - handle.AddKernel("miopenActivationForward", - network_config, - program_name, - kernel_name, - vld, - vgd, - compiler_options)(x, - y, - as_float(f_activ_gamma), - as_float(f_activ_beta), - as_float(f_activ_alpha), - static_cast(xOffset), - static_cast(yOffset)); - } - else - { - - vgd.push_back(height); - vgd.push_back(1); - - handle.AddKernel("miopenActivationForward", - network_config, - program_name, - kernel_name, - vld, - vgd, - compiler_options)(x, - y, - as_float(f_activ_gamma), - as_float(f_activ_beta), - as_float(f_activ_alpha), - static_cast(xOffset), - static_cast(yOffset), - x_stride2D, - y_stride2D); - } + case 1: + std::tie(wOut) = tien<1>(yDesc.GetLengths()); + std::tie(wOutStride) = tien<1>(yDesc.GetStrides()); + nOutStride = wOut * wOutStride; + cOutStride = wOut * wOutStride; + hOutStride = wOut * wOutStride; + break; + case 2: + std::tie(hOut, wOut) = tien<2>(yDesc.GetLengths()); + std::tie(hOutStride, wOutStride) = tien<2>(yDesc.GetStrides()); + nOutStride = hOut * hOutStride; + cOutStride = hOut * hOutStride; + break; + case 3: + std::tie(cOut, hOut, wOut) = tien<3>(yDesc.GetLengths()); + std::tie(cOutStride, hOutStride, wOutStride) = tien<3>(yDesc.GetStrides()); + nOutStride = cOut * cOutStride; + break; + default: assert(false); } } else { - construct_params.setStream(&handle); - - int nOut = 1; - int cOut = 1; - int hOut = 1; - int wOut = 1; - int nOutStride = 0; - int cOutStride = 0; - int hOutStride = 0; - int wOutStride = 0; - - if(yDesc.GetSize() == 4) - { - std::tie(nOut, cOut, hOut, wOut) = tien<4>(yDesc.GetLengths()); - std::tie(nOutStride, cOutStride, hOutStride, wOutStride) = - tien<4>(yDesc.GetStrides()); - } - else if(yDesc.GetSize() < 4 && yDesc.GetSize() > 0) - { - auto tensor_size = yDesc.GetSize(); - switch(tensor_size) - { - case 1: - std::tie(wOut) = tien<1>(yDesc.GetLengths()); - std::tie(wOutStride) = 
tien<1>(yDesc.GetStrides()); - nOutStride = wOut * wOutStride; - cOutStride = wOut * wOutStride; - hOutStride = wOut * wOutStride; - break; - case 2: - std::tie(hOut, wOut) = tien<2>(yDesc.GetLengths()); - std::tie(hOutStride, wOutStride) = tien<2>(yDesc.GetStrides()); - nOutStride = hOut * hOutStride; - cOutStride = hOut * hOutStride; - break; - case 3: - std::tie(cOut, hOut, wOut) = tien<3>(yDesc.GetLengths()); - std::tie(cOutStride, hOutStride, wOutStride) = tien<3>(yDesc.GetStrides()); - nOutStride = cOut * cOutStride; - break; - default: assert(false); - } - } - else - { - MIOPEN_THROW( - "activation does not support tensor size larger than 4 or smaller than 1"); - } - - construct_params.setTopDescFromMLDesc(yDesc); - int nIn = 1; - int cIn = 1; - int hIn = 1; - int wIn = 1; - int nInStride = 0; - int cInStride = 0; - int hInStride = 0; - int wInStride = 0; + MIOPEN_THROW("activation does not support tensor size larger than 4 or smaller than 1"); + } - if(xDesc.GetSize() == 4) - { - std::tie(nIn, cIn, hIn, wIn) = tien<4>(xDesc.GetLengths()); - std::tie(nInStride, cInStride, hInStride, wInStride) = tien<4>(xDesc.GetStrides()); - } - else if(xDesc.GetSize() < 4 && xDesc.GetSize() > 0) - { - auto tensor_size = xDesc.GetSize(); - switch(tensor_size) - { - case 1: - std::tie(wIn) = tien<1>(xDesc.GetLengths()); - std::tie(wInStride) = tien<1>(xDesc.GetStrides()); - nInStride = wIn * wInStride; - cInStride = wIn * wInStride; - hInStride = wIn * wInStride; - break; - case 2: - std::tie(hIn, wIn) = tien<2>(xDesc.GetLengths()); - std::tie(hInStride, wInStride) = tien<2>(xDesc.GetStrides()); - nInStride = hIn * hInStride; - cInStride = hIn * hInStride; - break; - case 3: - std::tie(cIn, hIn, wIn) = tien<3>(xDesc.GetLengths()); - std::tie(cInStride, hInStride, wInStride) = tien<3>(xDesc.GetStrides()); - nInStride = cIn * cInStride; - break; - default: assert(false); - } - } - else + construct_params.setTopDescFromMLDesc(yDesc); + int nIn = 1; + int cIn = 1; + int hIn = 1; 
+ int wIn = 1; + int nInStride = 0; + int cInStride = 0; + int hInStride = 0; + int wInStride = 0; + + if(xDesc.GetSize() == 4) + { + std::tie(nIn, cIn, hIn, wIn) = tien<4>(xDesc.GetLengths()); + std::tie(nInStride, cInStride, hInStride, wInStride) = tien<4>(xDesc.GetStrides()); + } + else if(xDesc.GetSize() < 4 && xDesc.GetSize() > 0) + { + auto tensor_size = xDesc.GetSize(); + switch(tensor_size) { - MIOPEN_THROW( - "Activation does not support tensor dimension larger than 4 or smaller than 1"); + case 1: + std::tie(wIn) = tien<1>(xDesc.GetLengths()); + std::tie(wInStride) = tien<1>(xDesc.GetStrides()); + nInStride = wIn * wInStride; + cInStride = wIn * wInStride; + hInStride = wIn * wInStride; + break; + case 2: + std::tie(hIn, wIn) = tien<2>(xDesc.GetLengths()); + std::tie(hInStride, wInStride) = tien<2>(xDesc.GetStrides()); + nInStride = hIn * hInStride; + cInStride = hIn * hInStride; + break; + case 3: + std::tie(cIn, hIn, wIn) = tien<3>(xDesc.GetLengths()); + std::tie(cInStride, hInStride, wInStride) = tien<3>(xDesc.GetStrides()); + nInStride = cIn * cInStride; + break; + default: assert(false); } - - construct_params.setBotDescFromMLDesc(xDesc); - - construct_params.setNeuronDescr( - static_cast(mode), activ_gamma, activ_beta, activ_alpha); - - mloConstruct(construct_params); - - std::string program_name = construct_params.getKernelFile(); // CL kernel filename - std::string kernel_name = construct_params.getKernelName(); // kernel name - std::string compiler_options = - construct_params.getCompilerOptions(); // kernel parameters - - const std::vector& vld = construct_params.getLocalWkSize(); - const std::vector& vgd = construct_params.getGlobalWkSize(); - - int imode = mode; - construct_params.getNeuronDescr(imode, activ_gamma, activ_beta, activ_alpha); - - auto f_activ_alpha = as_float(activ_alpha); - auto f_activ_beta = as_float(activ_beta); - auto f_activ_gamma = as_float(activ_gamma); - - compiler_options += - " -DMIOPEN_N_IN=" + std::to_string(nIn) + 
" -DMIOPEN_C_IN=" + std::to_string(cIn) + - " -DMIOPEN_H_IN=" + std::to_string(hIn) + " -DMIOPEN_W_IN=" + std::to_string(wIn) + - " -DMIOPEN_N_IN_STRIDE=" + std::to_string(nInStride) + " -DMIOPEN_C_IN_STRIDE=" + - std::to_string(cInStride) + " -DMIOPEN_H_IN_STRIDE=" + std::to_string(hInStride) + - " -DMIOPEN_W_IN_STRIDE=" + std::to_string(wInStride) + " -DMIOPEN_N_OUT=" + - std::to_string(nOut) + " -DMIOPEN_C_OUT=" + std::to_string(cOut) + - " -DMIOPEN_H_OUT=" + std::to_string(hOut) + " -DMIOPEN_W_OUT=" + - std::to_string(wOut) + " -DMIOPEN_N_OUT_STRIDE=" + std::to_string(nOutStride) + - " -DMIOPEN_C_OUT_STRIDE=" + std::to_string(cOutStride) + " -DMIOPEN_H_OUT_STRIDE=" + - std::to_string(hOutStride) + " -DMIOPEN_W_OUT_STRIDE=" + - std::to_string(wOutStride) + " -DMIOPEN_N_DIN=" + std::to_string(1) + - " -DMIOPEN_C_DIN=" + std::to_string(1) + " -DMIOPEN_H_DIN=" + std::to_string(1) + - " -DMIOPEN_W_DIN=" + std::to_string(1) + " -DMIOPEN_N_DIN_STRIDE=" + - std::to_string(1) + " -DMIOPEN_C_DIN_STRIDE=" + std::to_string(1) + - " -DMIOPEN_H_DIN_STRIDE=" + std::to_string(1) + " -DMIOPEN_W_DIN_STRIDE=" + - std::to_string(1) + " -DMIOPEN_N_DOUT=" + std::to_string(1) + " -DMIOPEN_C_DOUT=" + - std::to_string(1) + " -DMIOPEN_H_DOUT=" + std::to_string(1) + " -DMIOPEN_W_DOUT=" + - std::to_string(1) + " -DMIOPEN_N_DOUT_STRIDE=" + std::to_string(1) + - " -DMIOPEN_C_DOUT_STRIDE=" + std::to_string(1) + " -DMIOPEN_H_DOUT_STRIDE=" + - std::to_string(1) + " -DMIOPEN_W_DOUT_STRIDE=" + std::to_string(1) + - " -DMIOPEN_IN_BLOCK_SZ=" + std::to_string(cIn * hIn * wIn) + - " -DMIOPEN_OUT_BLOCK_SZ=" + std::to_string(cOut * hOut * wOut) + - " -DMIOPEN_DIN_BLOCK_SZ=" + std::to_string(1) + " -DMIOPEN_DOUT_BLOCK_SZ=" + - std::to_string(1); - - handle.AddKernel("miopenActivationForward", - network_config, - program_name, - kernel_name, - vld, - vgd, - compiler_options)(x, - y, - as_float(f_activ_gamma), - as_float(f_activ_beta), - as_float(f_activ_alpha), - static_cast(xOffset), - 
static_cast(yOffset)); } + else + { + MIOPEN_THROW( + "Activation does not support tensor dimension larger than 4 or smaller than 1"); + } + + construct_params.setBotDescFromMLDesc(xDesc); + + construct_params.setNeuronDescr( + static_cast(mode), activ_gamma, activ_beta, activ_alpha); + + mloConstruct(construct_params); + + std::string program_name = construct_params.getKernelFile(); // CL kernel filename + std::string kernel_name = construct_params.getKernelName(); // kernel name + std::string compiler_options = construct_params.getCompilerOptions(); // kernel parameters + + const std::vector& vld = construct_params.getLocalWkSize(); + const std::vector& vgd = construct_params.getGlobalWkSize(); + + int imode = mode; + construct_params.getNeuronDescr(imode, activ_gamma, activ_beta, activ_alpha); + + auto f_activ_alpha = as_float(activ_alpha); + auto f_activ_beta = as_float(activ_beta); + auto f_activ_gamma = as_float(activ_gamma); + + compiler_options += + " -DMIOPEN_N_IN=" + std::to_string(nIn) + " -DMIOPEN_C_IN=" + std::to_string(cIn) + + " -DMIOPEN_H_IN=" + std::to_string(hIn) + " -DMIOPEN_W_IN=" + std::to_string(wIn) + + " -DMIOPEN_N_IN_STRIDE=" + std::to_string(nInStride) + " -DMIOPEN_C_IN_STRIDE=" + + std::to_string(cInStride) + " -DMIOPEN_H_IN_STRIDE=" + std::to_string(hInStride) + + " -DMIOPEN_W_IN_STRIDE=" + std::to_string(wInStride) + " -DMIOPEN_N_OUT=" + + std::to_string(nOut) + " -DMIOPEN_C_OUT=" + std::to_string(cOut) + " -DMIOPEN_H_OUT=" + + std::to_string(hOut) + " -DMIOPEN_W_OUT=" + std::to_string(wOut) + + " -DMIOPEN_N_OUT_STRIDE=" + std::to_string(nOutStride) + " -DMIOPEN_C_OUT_STRIDE=" + + std::to_string(cOutStride) + " -DMIOPEN_H_OUT_STRIDE=" + std::to_string(hOutStride) + + " -DMIOPEN_W_OUT_STRIDE=" + std::to_string(wOutStride) + " -DMIOPEN_N_DIN=" + + std::to_string(1) + " -DMIOPEN_C_DIN=" + std::to_string(1) + " -DMIOPEN_H_DIN=" + + std::to_string(1) + " -DMIOPEN_W_DIN=" + std::to_string(1) + " -DMIOPEN_N_DIN_STRIDE=" + + std::to_string(1) + 
" -DMIOPEN_C_DIN_STRIDE=" + std::to_string(1) + + " -DMIOPEN_H_DIN_STRIDE=" + std::to_string(1) + " -DMIOPEN_W_DIN_STRIDE=" + + std::to_string(1) + " -DMIOPEN_N_DOUT=" + std::to_string(1) + " -DMIOPEN_C_DOUT=" + + std::to_string(1) + " -DMIOPEN_H_DOUT=" + std::to_string(1) + " -DMIOPEN_W_DOUT=" + + std::to_string(1) + " -DMIOPEN_N_DOUT_STRIDE=" + std::to_string(1) + + " -DMIOPEN_C_DOUT_STRIDE=" + std::to_string(1) + " -DMIOPEN_H_DOUT_STRIDE=" + + std::to_string(1) + " -DMIOPEN_W_DOUT_STRIDE=" + std::to_string(1) + + " -DMIOPEN_IN_BLOCK_SZ=" + std::to_string(cIn * hIn * wIn) + " -DMIOPEN_OUT_BLOCK_SZ=" + + std::to_string(cOut * hOut * wOut) + " -DMIOPEN_DIN_BLOCK_SZ=" + std::to_string(1) + + " -DMIOPEN_DOUT_BLOCK_SZ=" + std::to_string(1); + + handle.AddKernel("miopenActivationForward", + network_config, + program_name, + kernel_name, + vld, + vgd, + compiler_options)(x, + y, + as_float(f_activ_gamma), + as_float(f_activ_beta), + as_float(f_activ_alpha), + static_cast(xOffset), + static_cast(yOffset)); }); return (status); } diff --git a/src/ocl/convolutionocl.cpp b/src/ocl/convolutionocl.cpp index e157a5c1cd..5d339f981e 100644 --- a/src/ocl/convolutionocl.cpp +++ b/src/ocl/convolutionocl.cpp @@ -958,7 +958,8 @@ static Invoker PrepareInvoker(Handle& handle, const auto invoker = handle.PrepareInvoker(*solution.invoker_factory, solution.construction_params); - handle.RegisterInvoker(invoker, config, solver_id, AlgorithmName(solver_id.GetAlgo(dir))); + handle.RegisterInvoker( + invoker, config, solver_id.ToString(), AlgorithmName(solver_id.GetAlgo(dir))); return invoker; // NOLINT (performance-no-automatic-move) } diff --git a/src/reducetensor.cpp b/src/reducetensor.cpp index b3aba09049..6254d1d058 100644 --- a/src/reducetensor.cpp +++ b/src/reducetensor.cpp @@ -593,9 +593,9 @@ void ReduceTensorDescriptor::ReduceTensor(const Handle& handle, std::to_string(compType) + "IN"; for(auto dimLen : inDescLengths) network_config += std::to_string(dimLen) + "_"; - network_config 
+= "OUT"; - for(auto dimLen : outDescLengths) - network_config += std::to_string(dimLen) + "_"; + network_config += "RED"; + for(auto dim : toReduceDims) + network_config += std::to_string(dim) + "_"; network_config += "BSIZE_" + std::to_string(blockSize); // kernel for the first call diff --git a/src/solver.cpp b/src/solver.cpp index f93e225589..d1e038ba05 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2017 Advanced Micro Devices, Inc. + * Copyright (c) 2021 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -25,8 +25,9 @@ *******************************************************************************/ #include -#include +#include +#include #include #include #include @@ -172,10 +173,7 @@ miopenConvAlgorithm_t Id::GetAlgo() const return it->second; } -inline bool Register(IdRegistryData& registry, - uint64_t value, - const std::string& str, - miopenConvAlgorithm_t algo) +inline bool Register(IdRegistryData& registry, uint64_t value, const std::string& str) { if(value == Id::invalid_value) { @@ -204,6 +202,16 @@ inline bool Register(IdRegistryData& registry, registry.value_to_str.emplace(value, str); registry.str_to_value.emplace(str, value); + return true; +} + +inline bool Register(IdRegistryData& registry, + uint64_t value, + const std::string& str, + miopenConvAlgorithm_t algo) +{ + if(!Register(registry, value, str)) + return false; registry.value_to_algo.emplace(value, algo); return true; } @@ -418,6 +426,8 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) RegisterWithSolver(registry, ++id, ConvMlirIgemmBwdXdlops{}, miopenConvolutionAlgoImplicitGEMM); RegisterWithSolver(registry, ++id, ConvMlirIgemmWrWXdlops{}, miopenConvolutionAlgoImplicitGEMM); + Register(registry, ++id, SolverDbId(activ::ActivFwdSolver0{})); + RegisterWithSolver(registry, ++id, 
ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC{}, diff --git a/src/solver/activ/fwd_0.cpp b/src/solver/activ/fwd_0.cpp new file mode 100644 index 0000000000..7b149d3e95 --- /dev/null +++ b/src/solver/activ/fwd_0.cpp @@ -0,0 +1,217 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include + +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace activ { + +bool ActivFwdSolver0::IsApplicable(const ExecutionContext&, + const miopen::activ::ProblemDescription& problem) const +{ + if(problem.GetDirection() != miopen::activ::Direction::Forward) + return false; + + // short cut for packed tensors and 2D tensors with stride != width + const auto& x_lens = problem.GetXDesc().GetLengths(); + const auto& y_lens = problem.GetYDesc().GetLengths(); + + const auto& x_strides = problem.GetXDesc().GetStrides(); + const auto& y_strides = problem.GetYDesc().GetStrides(); + + const auto x_elem_sz = problem.GetXDesc().GetElementSize(); + const auto y_elem_sz = problem.GetYDesc().GetElementSize(); + + const auto x_stride2D = static_cast( + (x_lens.size() == 2) ? x_strides[0] : (x_lens.size() == 3) + ? x_strides[1] + : (x_lens.size() == 4) ? x_strides[2] + : x_strides[3]); + const auto y_stride2D = static_cast( + (y_lens.size() == 2) ? y_strides[0] : (y_lens.size() == 3) + ? y_strides[1] + : (y_lens.size() == 4) ? y_strides[2] + : y_strides[3]); + + const auto x_width2D = + ((x_lens.size() == 2) ? x_lens[1] : (x_lens.size() == 3) ? x_lens[2] : (x_lens.size() == 4) + ? x_lens[3] + : x_lens[4]); + + const auto y_width2D = + ((y_lens.size() == 2) ? y_lens[1] : (y_lens.size() == 3) ? y_lens[2] : (y_lens.size() == 4) + ? 
y_lens[3] + : y_lens[4]); + + const auto t2D = + (x_lens.size() == y_lens.size() && + ((x_width2D != x_stride2D) || (y_width2D != y_stride2D)) && + (x_lens.size() == 2 || (x_lens.size() == 3 && x_lens[0] == 1 && y_lens[0] == 1) || + (x_lens.size() == 4 && x_lens[0] == 1 && x_lens[1] == 1 && y_lens[0] == 1 && + y_lens[1] == 1) || + (x_lens.size() == 5 && x_lens[0] == 1 && x_lens[1] == 1 && x_lens[2] == 1 && + y_lens[0] == 1 && y_lens[1] == 1 && y_lens[2] == 1))); + const auto packed = problem.GetXDesc().IsPacked() && problem.GetYDesc().IsPacked(); + + return x_elem_sz == y_elem_sz && (packed || t2D); +} + +ConvSolution ActivFwdSolver0::GetSolution(const ExecutionContext&, + const miopen::activ::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + // short cut for packed tensors and 2D tensors with stride != width + const auto x_lens = problem.GetXDesc().GetLengths(); + const auto y_lens = problem.GetYDesc().GetLengths(); + + const auto x_elem_sz = problem.GetXDesc().GetElementSize(); + + const auto x_width2D = + ((x_lens.size() == 2) ? x_lens[1] : (x_lens.size() == 3) ? x_lens[2] : (x_lens.size() == 4) + ? x_lens[3] + : x_lens[4]); + + const auto packed = problem.GetXDesc().IsPacked() && problem.GetYDesc().IsPacked(); + const auto read_len = (packed) ? x_elem_sz : x_width2D; + const auto read_unit = (read_len % 4 == 0) ? 4 : (read_len % 2 == 0) ? 2 : 1; + + const auto READ_TYPE = (read_unit == 1) ? "_FLOAT" : "_FLOAT" + std::to_string(read_unit); + + const auto height = + (x_lens.size() == 2) ? x_lens[0] : (x_lens.size() == 3) ? x_lens[1] : (x_lens.size() == 4) + ? 
x_lens[2] + : x_lens[3]; + + auto build_params = KernelBuildParameters{ + {"LITE"}, + {"MIOPEN_READ_UNIT", read_unit}, + {"MIOPEN_READ_TYPE", READ_TYPE}, + {"MIOPEN_NRN_OP_ID", problem.GetActivDesc().GetMode()}, + }; + + if(problem.GetXDesc().GetType() == miopenFloat) + { + build_params.Define("MIOPEN_USE_FP16", 0); + build_params.Define("MIOPEN_USE_FP32", 1); + } + else if(problem.GetXDesc().GetType() == miopenHalf) + { + build_params.Define("MIOPEN_USE_FP16", 1); + build_params.Define("MIOPEN_USE_FP32", 0); + } + + { + auto kernel_info = KernelInfo{}; + kernel_info.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + + kernel_info.l_wk.push_back(256); + kernel_info.l_wk.push_back(1); + kernel_info.l_wk.push_back(1); + + const auto MAP_RD = read_len / read_unit; + + kernel_info.g_wk.push_back(MAP_RD); + kernel_info.g_wk.push_back(packed ? 1 : height); + kernel_info.g_wk.push_back(1); + + kernel_info.kernel_file = "MIOpenNeuron.cl"; + kernel_info.kernel_name = (packed) ? "MIOpenActiveFwdLite" : "MIOpenActiveFwd2DLite"; + + result.construction_params.push_back(kernel_info); + } + + result.invoker_factory = [=](const std::vector& kernels) { + return [=](const Handle& handle, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(params.x_desc.GetType(), [&](auto as_float) { + const auto alpha = as_float(params.alpha); + const auto beta = as_float(params.beta); + const auto gamma = as_float(params.gamma); + + if(packed) + { + kernel(params.x, + params.y, + gamma, + beta, + alpha, + static_cast(params.x_offset), + static_cast(params.y_offset)); + } + else + { + const auto x_lens_ = params.x_desc.GetLengths(); + const auto y_lens_ = params.y_desc.GetLengths(); + + const auto x_strides = params.x_desc.GetStrides(); + const auto y_strides = params.y_desc.GetStrides(); + + const auto x_stride2D = static_cast( + (x_lens_.size() == 2) ? 
x_strides[0] : (x_lens_.size() == 3) + ? x_strides[1] + : (x_lens_.size() == 4) + ? x_strides[2] + : x_strides[3]); + const auto y_stride2D = static_cast( + (y_lens_.size() == 2) ? y_strides[0] : (y_lens_.size() == 3) + ? y_strides[1] + : (y_lens_.size() == 4) + ? y_strides[2] + : y_strides[3]); + + kernel(params.x, + params.y, + gamma, + beta, + alpha, + static_cast(params.x_offset), + static_cast(params.y_offset), + x_stride2D, + y_stride2D); + } + }); + }; + }; + + return result; +} + +} // namespace activ + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/conv_asm_implicit_gemm_wrw_v4r1_dynamic.cpp b/src/solver/conv_asm_implicit_gemm_wrw_v4r1_dynamic.cpp index 0354ecc9d2..9d4475d3fe 100644 --- a/src/solver/conv_asm_implicit_gemm_wrw_v4r1_dynamic.cpp +++ b/src/solver/conv_asm_implicit_gemm_wrw_v4r1_dynamic.cpp @@ -208,7 +208,7 @@ static inline bool FindImplicitGemmWrwV4R1DynamicKernel(const ConvolutionContext grid_size = (GemmM / GemmMPerBlock) * (GemmN / (GemmNRepeat * GemmNPerThreadSubC * GemmN0YXPerBlock)) * GemmKGroups; - if((ho * wo) % 4 == 0) + if((ho * wo) % 16 == 0) kernel_name = "igemm_v4r1_dynamic_wrw_128x128x16_8x8_4x4x4x4x4x4_16x1x16x1_4x64"; else kernel_name = "igemm_v4r1_dynamic_wrw_128x128x16_8x8_4x4x4x4x4x4_16x1x16x1_16x16"; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index bedc8f50df..64d09d40ec 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -934,6 +934,8 @@ COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS} $ --verbose -- COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS} $ --verbose --input 32 128 34 34 --weights 64 128 3 3 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS} $ --verbose --input 128 128 35 35 --weights 128 128 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-forward --disable-backward-data COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS} $ --verbose --input 128 256 56 56 --weights 64 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 
--disable-forward --disable-backward-data +COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS} $ --verbose --input 64 512 28 28 --weights 256 512 1 1 --pads_strides_dilations 0 0 2 2 1 1 --disable-forward --disable-backward-data +COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS} $ --verbose --input 64 512 14 14 --weights 256 512 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS} $ --verbose --input 64 64 56 56 --weights 256 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS} $ --verbose --input 32 128 34 34 --weights 64 128 3 3 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS} $ --verbose --input 128 128 35 35 --weights 128 128 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-forward --disable-backward-weights @@ -987,6 +989,9 @@ COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ ${MIO COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 3 256 28 28 --weights 80 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 2 256 12 18 --weights 256 256 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-forward --disable-backward-data COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 4 512 128 128 --weights 12 512 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data +#regression test for issue 540 +COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 4 32 79 141 --weights 64 32 5 10 --pads_strides_dilations 0 0 2 2 1 1 --disable-forward --disable-backward-data + # WORKAROUND_ISSUE_996 # COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 400 256 7 7 --weights 
1024 256 7 7 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data # WORKAROUND_ISSUE_996 @@ -1121,8 +1126,7 @@ if(MIOPEN_TEST_CONV) endif() if(MIOPEN_TEST_FLOAT) -# WORKAROUND_SWDEV_291479 -# add_custom_test(test_reduce_double SKIP_UNLESS_ALL GFX908_ENABLED COMMAND $ --double --all --verbose) + add_custom_test(test_reduce_double SKIP_UNLESS_ALL GFX908_ENABLED COMMAND $ --double --all --verbose) endif() # Add here regression tests that should be run only on Vega10/20 and only with FP16. diff --git a/test/reduce_test.cpp b/test/reduce_test.cpp index 324e7933fd..9c3f678eda 100644 --- a/test/reduce_test.cpp +++ b/test/reduce_test.cpp @@ -820,11 +820,21 @@ struct reduce_driver : test_driver max_value = miopen_type{} == miopenHalf ? 13 : miopen_type{} == miopenInt8 ? 127 : 17; + // default data gneration (used by MIN/MAX) auto gen_value = [&](auto... is) { return (tensor_elem_gen_integer{max_value}(is...) * tensor_elem_gen_checkboard_sign{}(is...)); }; + // data generation used by ADD/AVG, data is distributed around 1.0 rather than 0.0, very low + // probability to get a reduced result of zero-value + auto gen_value_1 = [&](auto... is) { + auto rand_value = tensor_elem_gen_integer{max_value}(is...); + auto sign_value = tensor_elem_gen_checkboard_sign{}(is...); + + return (sign_value * rand_value + 1.0); + }; + // Special data generation for MUL, to avoid all-zero and large accumulative error in the // reduced result auto gen_value_2 = [&](auto... is) { @@ -835,12 +845,7 @@ struct reduce_driver : test_driver : (rand_value + max_value + 1) / (rand_value + max_value); }; - bool need_indices = - ((reduceOp == MIOPEN_REDUCE_TENSOR_MIN || reduceOp == MIOPEN_REDUCE_TENSOR_MAX || - reduceOp == MIOPEN_REDUCE_TENSOR_AMAX) && - indicesOpt != MIOPEN_REDUCE_TENSOR_NO_INDICES); - - // Special data generation for MIN/MAX/AMAX using a space of limitless number of values. 
+ // Special data generation for NORM1 and NORM2 using a space of limitless number of values. // This method is slower due to the use of rand(), it is usually used for manual testing auto gen_value_3 = [&](auto... is) { auto rand_upper = tensor_elem_gen_integer{max_value}(is...); @@ -850,6 +855,14 @@ struct reduce_driver : test_driver return rand_upper * sign_value * rand_ratio; }; + // Special data generation for AMAX, no zero value used + auto gen_value_4 = [&](auto... is) { + auto rand_value = tensor_elem_gen_integer{max_value}(is...); + auto sign_value = tensor_elem_gen_checkboard_sign{}(is...); + + return sign_value > 0.0 ? (rand_value + 0.5) : (-1.0 * rand_value - 0.5); + }; + // default tolerance (refer to driver.hpp) this->tolerance = 80; @@ -866,12 +879,27 @@ struct reduce_driver : test_driver if(std::is_same::value) this->tolerance *= this->tolerance * 10.0; - auto inputTensor = (reduceOp == MIOPEN_REDUCE_TENSOR_MUL) - ? tensor{this->inLengths}.generate(gen_value_2) - : (need_indices || reduceOp == MIOPEN_REDUCE_TENSOR_NORM1 || - reduceOp == MIOPEN_REDUCE_TENSOR_NORM2 - ? tensor{this->inLengths}.generate(gen_value_3) - : tensor{this->inLengths}.generate(gen_value)); + tensor inputTensor; + + switch(reduceOp) + { + case MIOPEN_REDUCE_TENSOR_ADD: + case MIOPEN_REDUCE_TENSOR_AVG: + inputTensor = tensor{this->inLengths}.generate(gen_value_1); + break; + case MIOPEN_REDUCE_TENSOR_MUL: + inputTensor = tensor{this->inLengths}.generate(gen_value_2); + break; + case MIOPEN_REDUCE_TENSOR_NORM1: + case MIOPEN_REDUCE_TENSOR_NORM2: + inputTensor = tensor{this->inLengths}.generate(gen_value_3); + break; + case MIOPEN_REDUCE_TENSOR_AMAX: + inputTensor = tensor{this->inLengths}.generate(gen_value_4); + break; + default: inputTensor = tensor{this->inLengths}.generate(gen_value); + }; + auto outputTensor = tensor{outLengths}; std::fill(outputTensor.begin(), outputTensor.end(), convert_type(0.0f));