diff --git a/src/kernels/composable_kernel/include/kernel_algorithm/reduction_functions.hpp b/src/kernels/composable_kernel/include/kernel_algorithm/reduction_functions.hpp
index 4ed6bb6fd2..8772270693 100644
--- a/src/kernels/composable_kernel/include/kernel_algorithm/reduction_functions.hpp
+++ b/src/kernels/composable_kernel/include/kernel_algorithm/reduction_functions.hpp
@@ -50,10 +50,12 @@ struct binop_with_nan_check
         opReduce{}(accuVal, currVal, changed);
@@ -75,7 +77,7 @@ struct binop_with_nan_check
 
     // The method is called when the opReduce is indexable and the user asked for indices
     __device__ static inline void
-    calculate(compType& accuVal, compType currVal, int& accuIndex, int currIndex)
+    calculate(compType& accuVal, compType currVal, VOLATILE_WA_274384 int& accuIndex, int currIndex)
     {
         if(isnan(currVal))
         {
@@ -84,7 +86,7 @@
         }
         else
         {
-            bool changed = false;
+            VOLATILE_WA_274384 bool changed = false;
 
             opReduce{}(accuVal, currVal, changed);
 
@@ -527,9 +529,9 @@ struct BlockwiseReduction_2d_block_buffer
                                               compType& accuData,
                                               int& accuIndex)
     {
-        const index_t thread_local_id = get_thread_local_1d_id();
-        compType lAccuData            = opReduce::GetZeroVal();
-        int lAccuIndex                = 0;
+        const index_t thread_local_id     = get_thread_local_1d_id();
+        compType lAccuData                = opReduce::GetZeroVal();
+        VOLATILE_WA_274384 int lAccuIndex = 0;
 
         static_if{}([&](auto) {
             for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
diff --git a/src/kernels/composable_kernel/include/utility/reduction_common.hpp b/src/kernels/composable_kernel/include/utility/reduction_common.hpp
index fc389efcb2..e1c5c2f2df 100644
--- a/src/kernels/composable_kernel/include/utility/reduction_common.hpp
+++ b/src/kernels/composable_kernel/include/utility/reduction_common.hpp
@@ -28,6 +28,14 @@
 
 #include "float_type.hpp"
 
+#define WORKAROUND_SWDEV_274384 (HIP_PACKAGE_VERSION_FLAT >= 4002021203ULL)
+
+#if WORKAROUND_SWDEV_274384
+#define VOLATILE_WA_274384 volatile
+#else
+#define VOLATILE_WA_274384
+#endif
+
 // this enumerate should be synchronized with include/miopen/reduce_common.hpp
 namespace ck {
 enum class ReductionMethod_t
diff --git a/src/kernels/composable_kernel/include/utility/reduction_operator.hpp b/src/kernels/composable_kernel/include/utility/reduction_operator.hpp
index 3019cbd9e4..a9c041895d 100644
--- a/src/kernels/composable_kernel/include/utility/reduction_operator.hpp
+++ b/src/kernels/composable_kernel/include/utility/reduction_operator.hpp
@@ -93,15 +93,13 @@ struct Max
             a = b;
     }
 
-    __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
+    __device__ inline constexpr void operator()(T& a, T b, VOLATILE_WA_274384 bool& changed) const
     {
         if(a < b)
         {
             a       = b;
             changed = true;
         }
-        else
-            changed = false;
     }
 
     static constexpr bool indexable = true;
@@ -120,15 +118,13 @@ struct Min
             a = b;
     }
 
-    __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
+    __device__ inline constexpr void operator()(T& a, T b, VOLATILE_WA_274384 bool& changed) const
     {
         if(a > b)
         {
             a       = b;
             changed = true;
         }
-        else
-            changed = false;
     }
 
     static constexpr bool indexable = true;
diff --git a/src/reducetensor.cpp b/src/reducetensor.cpp
index b3aba09049..6254d1d058 100644
--- a/src/reducetensor.cpp
+++ b/src/reducetensor.cpp
@@ -593,9 +593,9 @@ void ReduceTensorDescriptor::ReduceTensor(const Handle& handle,
                          std::to_string(compType) + "IN";
     for(auto dimLen : inDescLengths)
         network_config += std::to_string(dimLen) + "_";
-    network_config += "OUT";
-    for(auto dimLen : outDescLengths)
-        network_config += std::to_string(dimLen) + "_";
+    network_config += "RED";
+    for(auto dim : toReduceDims)
+        network_config += std::to_string(dim) + "_";
     network_config += "BSIZE_" + std::to_string(blockSize);
 
     // kernel for the first call
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 4f812b985f..c2c45edf22 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1085,8 +1085,7 @@ if(MIOPEN_TEST_CONV)
 endif()
 
 if(MIOPEN_TEST_FLOAT)
-# WORKAROUND_SWDEV_291479
-# add_custom_test(test_reduce_double SKIP_UNLESS_ALL GFX908_ENABLED COMMAND $ --double --all --verbose)
+add_custom_test(test_reduce_double SKIP_UNLESS_ALL GFX908_ENABLED COMMAND $ --double --all --verbose)
 endif()
 
 # Add here regression tests that should be run only on Vega10/20 and only with FP16.
diff --git a/test/reduce_test.cpp b/test/reduce_test.cpp
index 324e7933fd..9c3f678eda 100644
--- a/test/reduce_test.cpp
+++ b/test/reduce_test.cpp
@@ -820,11 +820,21 @@ struct reduce_driver : test_driver
         max_value =
             miopen_type<T>{} == miopenHalf ? 13 : miopen_type<T>{} == miopenInt8 ? 127 : 17;
 
+        // default data generation (used by MIN/MAX)
         auto gen_value = [&](auto... is) {
             return (tensor_elem_gen_integer{max_value}(is...) *
                     tensor_elem_gen_checkboard_sign{}(is...));
         };
 
+        // data generation used by ADD/AVG, data is distributed around 1.0 rather than 0.0, very low
+        // probability to get a reduced result of zero-value
+        auto gen_value_1 = [&](auto... is) {
+            auto rand_value = tensor_elem_gen_integer{max_value}(is...);
+            auto sign_value = tensor_elem_gen_checkboard_sign{}(is...);
+
+            return (sign_value * rand_value + 1.0);
+        };
+
         // Special data generation for MUL, to avoid all-zero and large accumulative error in the
         // reduced result
         auto gen_value_2 = [&](auto... is) {
@@ -835,12 +845,7 @@
                            : (rand_value + max_value + 1) / (rand_value + max_value);
         };
 
-        bool need_indices =
-            ((reduceOp == MIOPEN_REDUCE_TENSOR_MIN || reduceOp == MIOPEN_REDUCE_TENSOR_MAX ||
-              reduceOp == MIOPEN_REDUCE_TENSOR_AMAX) &&
-             indicesOpt != MIOPEN_REDUCE_TENSOR_NO_INDICES);
-
-        // Special data generation for MIN/MAX/AMAX using a space of limitless number of values.
+        // Special data generation for NORM1 and NORM2 using a space of limitless number of values.
         // This method is slower due to the use of rand(), it is usually used for manual testing
         auto gen_value_3 = [&](auto... is) {
             auto rand_upper = tensor_elem_gen_integer{max_value}(is...);
@@ -850,6 +855,14 @@
             return rand_upper * sign_value * rand_ratio;
         };
 
+        // Special data generation for AMAX, no zero value used
+        auto gen_value_4 = [&](auto... is) {
+            auto rand_value = tensor_elem_gen_integer{max_value}(is...);
+            auto sign_value = tensor_elem_gen_checkboard_sign{}(is...);
+
+            return sign_value > 0.0 ? (rand_value + 0.5) : (-1.0 * rand_value - 0.5);
+        };
+
         // default tolerance (refer to driver.hpp)
         this->tolerance = 80;
 
@@ -866,12 +879,27 @@
         if(std::is_same::value)
             this->tolerance *= this->tolerance * 10.0;
 
-        auto inputTensor = (reduceOp == MIOPEN_REDUCE_TENSOR_MUL)
-                               ? tensor<T>{this->inLengths}.generate(gen_value_2)
-                               : (need_indices || reduceOp == MIOPEN_REDUCE_TENSOR_NORM1 ||
-                                          reduceOp == MIOPEN_REDUCE_TENSOR_NORM2
-                                      ? tensor<T>{this->inLengths}.generate(gen_value_3)
-                                      : tensor<T>{this->inLengths}.generate(gen_value));
+        tensor<T> inputTensor;
+
+        switch(reduceOp)
+        {
+        case MIOPEN_REDUCE_TENSOR_ADD:
+        case MIOPEN_REDUCE_TENSOR_AVG:
+            inputTensor = tensor<T>{this->inLengths}.generate(gen_value_1);
+            break;
+        case MIOPEN_REDUCE_TENSOR_MUL:
+            inputTensor = tensor<T>{this->inLengths}.generate(gen_value_2);
+            break;
+        case MIOPEN_REDUCE_TENSOR_NORM1:
+        case MIOPEN_REDUCE_TENSOR_NORM2:
+            inputTensor = tensor<T>{this->inLengths}.generate(gen_value_3);
+            break;
+        case MIOPEN_REDUCE_TENSOR_AMAX:
+            inputTensor = tensor<T>{this->inLengths}.generate(gen_value_4);
+            break;
+        default: inputTensor = tensor<T>{this->inLengths}.generate(gen_value);
+        };
+
         auto outputTensor = tensor<T>{outLengths};
 
         std::fill(outputTensor.begin(), outputTensor.end(), convert_type<T>(0.0f));
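
The kernel-side change above removes the "else changed = false;" branch from the indexable Max/Min operators and, on affected ROCm releases, qualifies the changed flag and the accumulated index with VOLATILE_WA_274384. The sketch below is illustrative only and is not part of the patch; MaxOp and reduce_with_index are hypothetical stand-ins for the CK Max operator and the blockwise reduction loop. It shows the contract the patch relies on: the operator only ever sets the flag to true, so the caller must reset it before each call.

// Hypothetical host-side analogue (C++17) of the patched Max operator and the
// indexed reduction loop; names are illustrative, not taken from the MIOpen tree.
#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

template <typename T>
struct MaxOp
{
    // As in the patched operator, the flag is only ever set to true;
    // the else-branch that used to write 'changed = false' is gone.
    void operator()(T& a, T b, bool& changed) const
    {
        if(a < b)
        {
            a       = b;
            changed = true;
        }
    }
};

// Sequential stand-in for the blockwise loop: because the operator no longer
// clears the flag, the caller resets it before every invocation.
template <typename T>
std::pair<T, int> reduce_with_index(const std::vector<T>& data)
{
    T accuVal     = data.at(0);
    int accuIndex = 0;

    for(std::size_t i = 1; i < data.size(); ++i)
    {
        bool changed = false; // reset each iteration, as the kernel code does
        MaxOp<T>{}(accuVal, data[i], changed);
        if(changed)
            accuIndex = static_cast<int>(i);
    }
    return {accuVal, accuIndex};
}

int main()
{
    std::vector<float> v{1.0f, 7.5f, 3.0f, 7.5f, -2.0f};
    auto result = reduce_with_index(v);
    assert(result.first == 7.5f && result.second == 1); // first maximum wins
    return 0;
}

In the device code the same per-thread pattern applies; the volatile qualifier, keyed to HIP_PACKAGE_VERSION_FLAT >= 4002021203, appears to exist only to keep the compiler from mishandling the flag/index updates (the SWDEV-274384 workaround) and does not change the semantics sketched here.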