diff --git a/src/kernels/composable_kernel/include/kernel_algorithm/reduction_functions.hpp b/src/kernels/composable_kernel/include/kernel_algorithm/reduction_functions.hpp
index 4ed6bb6fd2..8772270693 100644
--- a/src/kernels/composable_kernel/include/kernel_algorithm/reduction_functions.hpp
+++ b/src/kernels/composable_kernel/include/kernel_algorithm/reduction_functions.hpp
@@ -50,10 +50,12 @@ struct binop_with_nan_check
         opReduce{}(accuVal, currVal, changed);
@@ -75,7 +77,7 @@ struct binop_with_nan_check
 
     // The method is called when the opReduce is indexable and the user asked for indices
     __device__ static inline void
-    calculate(compType& accuVal, compType currVal, int& accuIndex, int currIndex)
+    calculate(compType& accuVal, compType currVal, VOLATILE_WA_274384 int& accuIndex, int currIndex)
     {
         if(isnan(currVal))
         {
@@ -84,7 +86,7 @@
         }
         else
         {
-            bool changed = false;
+            VOLATILE_WA_274384 bool changed = false;
 
             opReduce{}(accuVal, currVal, changed);
 
@@ -527,9 +529,9 @@ struct BlockwiseReduction_2d_block_buffer
                                               compType& accuData,
                                               int& accuIndex)
     {
-        const index_t thread_local_id = get_thread_local_1d_id();
-        compType lAccuData            = opReduce::GetZeroVal();
-        int lAccuIndex                = 0;
+        const index_t thread_local_id     = get_thread_local_1d_id();
+        compType lAccuData                = opReduce::GetZeroVal();
+        VOLATILE_WA_274384 int lAccuIndex = 0;
 
         static_if{}([&](auto) {
             for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
diff --git a/src/kernels/composable_kernel/include/utility/reduction_common.hpp b/src/kernels/composable_kernel/include/utility/reduction_common.hpp
index fc389efcb2..e1c5c2f2df 100644
--- a/src/kernels/composable_kernel/include/utility/reduction_common.hpp
+++ b/src/kernels/composable_kernel/include/utility/reduction_common.hpp
@@ -28,6 +28,14 @@
 
 #include "float_type.hpp"
 
+#define WORKAROUND_SWDEV_274384 (HIP_PACKAGE_VERSION_FLAT >= 4002021203ULL)
+
+#if WORKAROUND_SWDEV_274384
+#define VOLATILE_WA_274384 volatile
+#else
+#define VOLATILE_WA_274384
+#endif
+
 // this enumerate should be synchronized with include/miopen/reduce_common.hpp
 namespace ck {
 enum class ReductionMethod_t
diff --git a/src/kernels/composable_kernel/include/utility/reduction_operator.hpp b/src/kernels/composable_kernel/include/utility/reduction_operator.hpp
index 3019cbd9e4..a9c041895d 100644
--- a/src/kernels/composable_kernel/include/utility/reduction_operator.hpp
+++ b/src/kernels/composable_kernel/include/utility/reduction_operator.hpp
@@ -93,15 +93,13 @@ struct Max
             a = b;
     }
 
-    __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
+    __device__ inline constexpr void operator()(T& a, T b, VOLATILE_WA_274384 bool& changed) const
     {
         if(a < b)
         {
             a       = b;
             changed = true;
         }
-        else
-            changed = false;
     }
 
     static constexpr bool indexable = true;
@@ -120,15 +118,13 @@ struct Min
             a = b;
     }
 
-    __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
+    __device__ inline constexpr void operator()(T& a, T b, VOLATILE_WA_274384 bool& changed) const
     {
         if(a > b)
         {
             a       = b;
             changed = true;
         }
-        else
-            changed = false;
     }
 
     static constexpr bool indexable = true;
diff --git a/src/reducetensor.cpp b/src/reducetensor.cpp
index b3aba09049..6254d1d058 100644
--- a/src/reducetensor.cpp
+++ b/src/reducetensor.cpp
@@ -593,9 +593,9 @@ void ReduceTensorDescriptor::ReduceTensor(const Handle& handle,
                          std::to_string(compType) + "IN";
     for(auto dimLen : inDescLengths)
         network_config += std::to_string(dimLen) + "_";
-    network_config += "OUT";
-    for(auto dimLen : outDescLengths)
-        network_config += std::to_string(dimLen) + "_";
+    network_config += "RED";
+    for(auto dim : toReduceDims)
+        network_config += std::to_string(dim) + "_";
     network_config += "BSIZE_" + std::to_string(blockSize);
 
     // kernel for the first call
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 4f812b985f..c2c45edf22 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1085,8 +1085,7 @@ if(MIOPEN_TEST_CONV)
 endif()
 
 if(MIOPEN_TEST_FLOAT)
-# WORKAROUND_SWDEV_291479
-# add_custom_test(test_reduce_double SKIP_UNLESS_ALL GFX908_ENABLED COMMAND $ --double --all --verbose)
+add_custom_test(test_reduce_double SKIP_UNLESS_ALL GFX908_ENABLED COMMAND $ --double --all --verbose)
 endif()
 
 # Add here regression tests that should be run only on Vega10/20 and only with FP16.
diff --git a/test/reduce_test.cpp b/test/reduce_test.cpp
index 324e7933fd..9c3f678eda 100644
--- a/test/reduce_test.cpp
+++ b/test/reduce_test.cpp
@@ -820,11 +820,21 @@ struct reduce_driver : test_driver
         max_value =
             miopen_type<T>{} == miopenHalf ? 13 : miopen_type<T>{} == miopenInt8 ? 127 : 17;
 
+        // default data generation (used by MIN/MAX)
         auto gen_value = [&](auto... is) {
             return (tensor_elem_gen_integer{max_value}(is...) *
                     tensor_elem_gen_checkboard_sign{}(is...));
         };
 
+        // data generation used by ADD/AVG, data is distributed around 1.0 rather than 0.0, very low
+        // probability to get a reduced result of zero-value
+        auto gen_value_1 = [&](auto... is) {
+            auto rand_value = tensor_elem_gen_integer{max_value}(is...);
+            auto sign_value = tensor_elem_gen_checkboard_sign{}(is...);
+
+            return (sign_value * rand_value + 1.0);
+        };
+
         // Special data generation for MUL, to avoid all-zero and large accumulative error in the
         // reduced result
         auto gen_value_2 = [&](auto... is) {
@@ -835,12 +845,7 @@
                            : (rand_value + max_value + 1) / (rand_value + max_value);
         };
 
-        bool need_indices =
-            ((reduceOp == MIOPEN_REDUCE_TENSOR_MIN || reduceOp == MIOPEN_REDUCE_TENSOR_MAX ||
-              reduceOp == MIOPEN_REDUCE_TENSOR_AMAX) &&
-             indicesOpt != MIOPEN_REDUCE_TENSOR_NO_INDICES);
-
-        // Special data generation for MIN/MAX/AMAX using a space of limitless number of values.
+        // Special data generation for NORM1 and NORM2 using a space of limitless number of values.
         // This method is slower due to the use of rand(), it is usually used for manual testing
         auto gen_value_3 = [&](auto... is) {
             auto rand_upper = tensor_elem_gen_integer{max_value}(is...);
@@ -850,6 +855,14 @@
             return rand_upper * sign_value * rand_ratio;
         };
 
+        // Special data generation for AMAX, no zero value used
+        auto gen_value_4 = [&](auto... is) {
+            auto rand_value = tensor_elem_gen_integer{max_value}(is...);
+            auto sign_value = tensor_elem_gen_checkboard_sign{}(is...);
+
+            return sign_value > 0.0 ? (rand_value + 0.5) : (-1.0 * rand_value - 0.5);
+        };
+
         // default tolerance (refer to driver.hpp)
         this->tolerance = 80;
 
@@ -866,12 +879,27 @@
         if(std::is_same::value)
             this->tolerance *= this->tolerance * 10.0;
 
-        auto inputTensor = (reduceOp == MIOPEN_REDUCE_TENSOR_MUL)
-                               ? tensor<T>{this->inLengths}.generate(gen_value_2)
-                               : (need_indices || reduceOp == MIOPEN_REDUCE_TENSOR_NORM1 ||
-                                          reduceOp == MIOPEN_REDUCE_TENSOR_NORM2
-                                      ? tensor<T>{this->inLengths}.generate(gen_value_3)
-                                      : tensor<T>{this->inLengths}.generate(gen_value));
+        tensor<T> inputTensor;
+
+        switch(reduceOp)
+        {
+        case MIOPEN_REDUCE_TENSOR_ADD:
+        case MIOPEN_REDUCE_TENSOR_AVG:
+            inputTensor = tensor<T>{this->inLengths}.generate(gen_value_1);
+            break;
+        case MIOPEN_REDUCE_TENSOR_MUL:
+            inputTensor = tensor<T>{this->inLengths}.generate(gen_value_2);
+            break;
+        case MIOPEN_REDUCE_TENSOR_NORM1:
+        case MIOPEN_REDUCE_TENSOR_NORM2:
+            inputTensor = tensor<T>{this->inLengths}.generate(gen_value_3);
+            break;
+        case MIOPEN_REDUCE_TENSOR_AMAX:
+            inputTensor = tensor<T>{this->inLengths}.generate(gen_value_4);
+            break;
+        default: inputTensor = tensor<T>{this->inLengths}.generate(gen_value);
+        };
+
         auto outputTensor = tensor<T>{outLengths};
 
         std::fill(outputTensor.begin(), outputTensor.end(), convert_type<T>(0.0f));
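
The kernel-side change above removes the "else changed = false;" branch from the indexable Max/Min operators and, on affected ROCm releases, qualifies the changed flag and the accumulated index with VOLATILE_WA_274384. The sketch below is illustrative only and is not part of the patch; MaxOp and reduce_with_index are hypothetical stand-ins for the CK Max operator and the blockwise reduction loop. It shows the contract the patch relies on: the operator only ever sets the flag to true, so the caller must reset it before each call.

// Hypothetical host-side analogue (C++17) of the patched Max operator and the
// indexed reduction loop; names are illustrative, not taken from the MIOpen tree.
#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

template <typename T>
struct MaxOp
{
    // As in the patched operator, the flag is only ever set to true;
    // the else-branch that used to write 'changed = false' is gone.
    void operator()(T& a, T b, bool& changed) const
    {
        if(a < b)
        {
            a       = b;
            changed = true;
        }
    }
};

// Sequential stand-in for the blockwise loop: because the operator no longer
// clears the flag, the caller resets it before every invocation.
template <typename T>
std::pair<T, int> reduce_with_index(const std::vector<T>& data)
{
    T accuVal     = data.at(0);
    int accuIndex = 0;

    for(std::size_t i = 1; i < data.size(); ++i)
    {
        bool changed = false; // reset each iteration, as the kernel code does
        MaxOp<T>{}(accuVal, data[i], changed);
        if(changed)
            accuIndex = static_cast<int>(i);
    }
    return {accuVal, accuIndex};
}

int main()
{
    std::vector<float> v{1.0f, 7.5f, 3.0f, 7.5f, -2.0f};
    auto result = reduce_with_index(v);
    assert(result.first == 7.5f && result.second == 1); // first maximum wins
    return 0;
}

In the device code the same per-thread pattern applies; the volatile qualifier, keyed to HIP_PACKAGE_VERSION_FLAT >= 4002021203, appears to exist only to keep the compiler from mishandling the flag/index updates (the SWDEV-274384 workaround) and does not change the semantics sketched here.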