From 4683706bf172841f57b838f784e80b1a91fa3bb4 Mon Sep 17 00:00:00 2001 From: Daniel Lowell Date: Thu, 19 Dec 2019 17:42:38 -0600 Subject: [PATCH 01/15] Remove space in doc. --- doc/src/releasenotes.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/src/releasenotes.md b/doc/src/releasenotes.md index 9898707de3..37446ee731 100644 --- a/doc/src/releasenotes.md +++ b/doc/src/releasenotes.md @@ -29,7 +29,7 @@ Changes: -### 09/25/2019 [2.1.0] +### 09/25/2019 [ 2.1.0 ] - This release contains new layers, bug fixes, and a new convolution algorithm. From 743c46da06f161a60bbc3e4c75c16880a544ad76 Mon Sep 17 00:00:00 2001 From: zjing14 Date: Thu, 2 Jan 2020 14:14:43 -0600 Subject: [PATCH 02/15] Add file fix. --- .../include/utility/amd_xdlops.hpp | 720 ++++++++++++++++-- 1 file changed, 674 insertions(+), 46 deletions(-) diff --git a/src/kernels/composable_kernel/include/utility/amd_xdlops.hpp b/src/kernels/composable_kernel/include/utility/amd_xdlops.hpp index 0f3f5dedac..24e73cb179 100644 --- a/src/kernels/composable_kernel/include/utility/amd_xdlops.hpp +++ b/src/kernels/composable_kernel/include/utility/amd_xdlops.hpp @@ -12,17 +12,44 @@ extern "C" __device__ float32_t llvm_intrin_amdgcn_mfma_f32_32x32x1f32( extern "C" __device__ float16_t llvm_intrin_amdgcn_mfma_f32_32x32x2f32( float, float, float16_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.32x32x2f32"); +extern "C" __device__ float4_t llvm_intrin_amdgcn_mfma_f32_16x16x4f32( + float, float, float4_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.16x16x4f32"); + +extern "C" __device__ float16_t llvm_intrin_amdgcn_mfma_f32_16x16x1f32( + float, float, float16_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.16x16x1f32"); + +extern "C" __device__ float4_t llvm_intrin_amdgcn_mfma_f32_4x4x1f32( + float, float, float4_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.4x4x1f32"); + extern "C" __device__ float32_t llvm_intrin_amdgcn_mfma_f32_32x32x4f16( half4_t, half4_t, float32_t, int, int, int) 
__asm("llvm.amdgcn.mfma.f32.32x32x4f16");
 
 extern "C" __device__ float16_t llvm_intrin_amdgcn_mfma_f32_32x32x8f16(
     half4_t, half4_t, float16_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.32x32x8f16");
 
+extern "C" __device__ float4_t llvm_intrin_amdgcn_mfma_f32_16x16x16f16(
+    half4_t, half4_t, float4_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.16x16x16f16");
+
+extern "C" __device__ float16_t llvm_intrin_amdgcn_mfma_f32_16x16x4f16(
+    half4_t, half4_t, float16_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.16x16x4f16");
+
+extern "C" __device__ float4_t llvm_intrin_amdgcn_mfma_f32_4x4x4f16(
+    half4_t, half4_t, float4_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.4x4x4f16");
+
 extern "C" __device__ float32_t llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(
     ushort2_t, ushort2_t, float32_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.32x32x2bf16");
 
 extern "C" __device__ float16_t llvm_intrin_amdgcn_mfma_f32_32x32x4bf16(
     ushort2_t, ushort2_t, float16_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.32x32x4bf16");
+
+extern "C" __device__ float4_t llvm_intrin_amdgcn_mfma_f32_16x16x8bf16(
+    ushort2_t, ushort2_t, float4_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.16x16x8bf16");
+
+extern "C" __device__ float16_t llvm_intrin_amdgcn_mfma_f32_16x16x2bf16(
+    ushort2_t, ushort2_t, float16_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.16x16x2bf16");
+
+extern "C" __device__ float4_t llvm_intrin_amdgcn_mfma_f32_4x4x2bf16(
+    ushort2_t, ushort2_t, float4_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.4x4x2bf16");
 
 // clang-format off
 #define REPEATx4(f, off) f(off) f(off + 1) f(off + 2) f(off + 3)
@@ -46,7 +73,9 @@ extern "C" __device__ float16_t llvm_intrin_amdgcn_mfma_f32_32x32x4bf16(
     REPEAT_STRIDEx16(f, stride, off + 2 * stride * 16) \
     REPEAT_STRIDEx16(f, stride, off + 3 * stride * 16)
 
-#define NOP(n) asm volatile("\n s_nop " #n " " : :);
+#define S_NOP(n) \
+    static_assert((n) >=0 && (n) <= 15, "s_nop operand must be within [0..15]"); \
+    asm volatile("\n s_nop " #n " " : :);
 
 #define 
MFMA_F32_32x32x1F32(acc, reg_a, reg_b, cbsz, abid, blgp) \ asm volatile("v_mfma_f32_32x32x1f32 a[" #acc ":" #acc "+31], %0, %1, a[" #acc ":" #acc \ @@ -60,6 +89,24 @@ extern "C" __device__ float16_t llvm_intrin_amdgcn_mfma_f32_32x32x4bf16( : \ : "v"(reg_a), "v"(reg_b)); +#define MFMA_F32_16x16x4F32(acc, reg_a, reg_b, cbsz, abid, blgp) \ + asm volatile("v_mfma_f32_16x16x4f32 a[" #acc ":" #acc "+3], %0, %1, a[" #acc ":" #acc \ + "+3] cbsz: " #cbsz " abid: " #abid " blgp:" #blgp " " \ + : \ + : "v"(reg_a), "v"(reg_b)); + +#define MFMA_F32_16x16x1F32(acc, reg_a, reg_b, cbsz, abid, blgp) \ + asm volatile("v_mfma_f32_16x16x1f32 a[" #acc ":" #acc "+15], %0, %1, a[" #acc ":" #acc \ + "+15] cbsz: " #cbsz " abid: " #abid " blgp:" #blgp " " \ + : \ + : "v"(reg_a), "v"(reg_b)); + +#define MFMA_F32_4x4x1F32(acc, reg_a, reg_b, cbsz, abid, blgp) \ + asm volatile("v_mfma_f32_4x4x1f32 a[" #acc ":" #acc "+3], %0, %1, a[" #acc ":" #acc \ + "+3] cbsz: " #cbsz " abid: " #abid " blgp:" #blgp " " \ + : \ + : "v"(reg_a), "v"(reg_b)); + #define MFMA_F32_32x32x4F16(acc, reg_a, reg_b, cbsz, abid, blgp) \ asm volatile("v_mfma_f32_32x32x4f16 a[" #acc ":" #acc "+31], %0, %1, a[" #acc ":" #acc \ "+31] cbsz: " #cbsz " abid: " #abid " blgp:" #blgp " " \ @@ -72,6 +119,24 @@ extern "C" __device__ float16_t llvm_intrin_amdgcn_mfma_f32_32x32x4bf16( : \ : "v"(reg_a), "v"(reg_b)); +#define MFMA_F32_16x16x16F16(acc, reg_a, reg_b, cbsz, abid, blgp) \ + asm volatile("v_mfma_f32_16x16x16f16 a[" #acc ":" #acc "+3], %0, %1, a[" #acc ":" #acc \ + "+3] cbsz: " #cbsz " abid: " #abid " blgp:" #blgp " " \ + : \ + : "v"(reg_a), "v"(reg_b)); + +#define MFMA_F32_16x16x4F16(acc, reg_a, reg_b, cbsz, abid, blgp) \ + asm volatile("v_mfma_f32_16x16x4f16 a[" #acc ":" #acc "+15], %0, %1, a[" #acc ":" #acc \ + "+15] cbsz: " #cbsz " abid: " #abid " blgp:" #blgp " " \ + : \ + : "v"(reg_a), "v"(reg_b)); + +#define MFMA_F32_4x4x4F16(acc, reg_a, reg_b, cbsz, abid, blgp) \ + asm volatile("v_mfma_f32_4x4x4f16 a[" #acc ":" #acc 
"+3], %0, %1, a[" #acc ":" #acc \ + "+3] cbsz: " #cbsz " abid: " #abid " blgp:" #blgp " " \ + : \ + : "v"(reg_a), "v"(reg_b)); + #define MFMA_F32_32x32x2BF16(acc, reg_a, reg_b, cbsz, abid, blgp) \ asm volatile("v_mfma_f32_32x32x2bf16 a[" #acc ":" #acc "+31], %0, %1, a[" #acc ":" #acc \ "+31] cbsz: " #cbsz " abid: " #abid " blgp:" #blgp " " \ @@ -84,6 +149,24 @@ extern "C" __device__ float16_t llvm_intrin_amdgcn_mfma_f32_32x32x4bf16( : \ : "v"(reg_a), "v"(reg_b)); +#define MFMA_F32_16x16x8BF16(acc, reg_a, reg_b, cbsz, abid, blgp) \ + asm volatile("v_mfma_f32_16x16x8bf16 a[" #acc ":" #acc "+3], %0, %1, a[" #acc ":" #acc \ + "+3] cbsz: " #cbsz " abid: " #abid " blgp:" #blgp " " \ + : \ + : "v"(reg_a), "v"(reg_b)); + +#define MFMA_F32_16x16x2BF16(acc, reg_a, reg_b, cbsz, abid, blgp) \ + asm volatile("v_mfma_f32_16x16x2bf16 a[" #acc ":" #acc "+15], %0, %1, a[" #acc ":" #acc \ + "+15] cbsz: " #cbsz " abid: " #abid " blgp:" #blgp " " \ + : \ + : "v"(reg_a), "v"(reg_b)); + +#define MFMA_F32_4x4x2BF16(acc, reg_a, reg_b, cbsz, abid, blgp) \ + asm volatile("v_mfma_f32_4x4x2bf16 a[" #acc ":" #acc "+3], %0, %1, a[" #acc ":" #acc \ + "+3] cbsz: " #cbsz " abid: " #abid " blgp:" #blgp " " \ + : \ + : "v"(reg_a), "v"(reg_b)); + #define ACCVGPR_READ(acc_reg_id) \ asm volatile("v_accvgpr_read_b32 %0, a[" #acc_reg_id "]" : "=v"(arch_reg[acc_reg_id]) :); @@ -96,13 +179,87 @@ extern "C" __device__ float16_t llvm_intrin_amdgcn_mfma_f32_32x32x4bf16( template __device__ void gcnasm_accvgpr_read(float*); +template <> +__device__ void gcnasm_accvgpr_read<4>(float* arch_reg) +{ +#if CK_USE_AMD_XDLOPS_INLINE_ASM + REPEATx4(ACCVGPR_READ, 0) +#else + (void)arch_reg; +#endif +} + +template <> +__device__ void gcnasm_accvgpr_read<8>(float* arch_reg) +{ +#if CK_USE_AMD_XDLOPS_INLINE_ASM + asm volatile("\ + v_accvgpr_read_b32 %0, a[ 0] \n \ + v_accvgpr_read_b32 %1, a[ 1] \n \ + v_accvgpr_read_b32 %2, a[ 2] \n \ + v_accvgpr_read_b32 %3, a[ 3] \n \ + v_accvgpr_read_b32 %4, a[ 4] \n \ + 
v_accvgpr_read_b32 %5, a[ 5] \n \ + v_accvgpr_read_b32 %6, a[ 6] \n \ + v_accvgpr_read_b32 %7, a[ 7] \n \ + " + : + "=v"(arch_reg[ 0]), + "=v"(arch_reg[ 1]), + "=v"(arch_reg[ 2]), + "=v"(arch_reg[ 3]), + "=v"(arch_reg[ 4]), + "=v"(arch_reg[ 5]), + "=v"(arch_reg[ 6]), + "=v"(arch_reg[ 7]) + :); + +#else + (void)arch_reg; +#endif +} template <> __device__ void gcnasm_accvgpr_read<16>(float* arch_reg) { #if CK_USE_AMD_XDLOPS_INLINE_ASM - NOP(16) - REPEATx16(ACCVGPR_READ, 0) + asm volatile("\ + v_accvgpr_read_b32 %0, a[ 0] \n \ + v_accvgpr_read_b32 %1, a[ 1] \n \ + v_accvgpr_read_b32 %2, a[ 2] \n \ + v_accvgpr_read_b32 %3, a[ 3] \n \ + v_accvgpr_read_b32 %4, a[ 4] \n \ + v_accvgpr_read_b32 %5, a[ 5] \n \ + v_accvgpr_read_b32 %6, a[ 6] \n \ + v_accvgpr_read_b32 %7, a[ 7] \n \ + v_accvgpr_read_b32 %8, a[ 8] \n \ + v_accvgpr_read_b32 %9, a[ 9] \n \ + v_accvgpr_read_b32 %10, a[10] \n \ + v_accvgpr_read_b32 %11, a[11] \n \ + v_accvgpr_read_b32 %12, a[12] \n \ + v_accvgpr_read_b32 %13, a[13] \n \ + v_accvgpr_read_b32 %14, a[14] \n \ + v_accvgpr_read_b32 %15, a[15] \n \ + " + : + "=v"(arch_reg[ 0]), + "=v"(arch_reg[ 1]), + "=v"(arch_reg[ 2]), + "=v"(arch_reg[ 3]), + "=v"(arch_reg[ 4]), + "=v"(arch_reg[ 5]), + "=v"(arch_reg[ 6]), + "=v"(arch_reg[ 7]), + "=v"(arch_reg[ 8]), + "=v"(arch_reg[ 9]), + "=v"(arch_reg[10]), + "=v"(arch_reg[11]), + "=v"(arch_reg[12]), + "=v"(arch_reg[13]), + "=v"(arch_reg[14]), + "=v"(arch_reg[15]) + :); + #else (void)arch_reg; #endif @@ -112,9 +269,74 @@ template <> __device__ void gcnasm_accvgpr_read<32>(float* arch_reg) { #if CK_USE_AMD_XDLOPS_INLINE_ASM - NOP(16) - REPEATx16(ACCVGPR_READ, 0) - REPEATx16(ACCVGPR_READ, 16) + asm volatile("\ + v_accvgpr_read_b32 %0, a[ 0] \n \ + v_accvgpr_read_b32 %1, a[ 1] \n \ + v_accvgpr_read_b32 %2, a[ 2] \n \ + v_accvgpr_read_b32 %3, a[ 3] \n \ + v_accvgpr_read_b32 %4, a[ 4] \n \ + v_accvgpr_read_b32 %5, a[ 5] \n \ + v_accvgpr_read_b32 %6, a[ 6] \n \ + v_accvgpr_read_b32 %7, a[ 7] \n \ + v_accvgpr_read_b32 %8, a[ 
8] \n \ + v_accvgpr_read_b32 %9, a[ 9] \n \ + v_accvgpr_read_b32 %10, a[10] \n \ + v_accvgpr_read_b32 %11, a[11] \n \ + v_accvgpr_read_b32 %12, a[12] \n \ + v_accvgpr_read_b32 %13, a[13] \n \ + v_accvgpr_read_b32 %14, a[14] \n \ + v_accvgpr_read_b32 %15, a[15] \n \ + v_accvgpr_read_b32 %16, a[16] \n \ + v_accvgpr_read_b32 %17, a[17] \n \ + v_accvgpr_read_b32 %18, a[18] \n \ + v_accvgpr_read_b32 %19, a[19] \n \ + v_accvgpr_read_b32 %20, a[20] \n \ + v_accvgpr_read_b32 %21, a[21] \n \ + v_accvgpr_read_b32 %22, a[22] \n \ + v_accvgpr_read_b32 %23, a[23] \n \ + v_accvgpr_read_b32 %24, a[24] \n \ + v_accvgpr_read_b32 %25, a[25] \n \ + v_accvgpr_read_b32 %26, a[26] \n \ + v_accvgpr_read_b32 %27, a[27] \n \ + v_accvgpr_read_b32 %28, a[28] \n \ + v_accvgpr_read_b32 %29, a[29] \n \ + v_accvgpr_read_b32 %30, a[30] \n \ + v_accvgpr_read_b32 %31, a[31] \n \ + " + : + "=v"(arch_reg[ 0]), + "=v"(arch_reg[ 1]), + "=v"(arch_reg[ 2]), + "=v"(arch_reg[ 3]), + "=v"(arch_reg[ 4]), + "=v"(arch_reg[ 5]), + "=v"(arch_reg[ 6]), + "=v"(arch_reg[ 7]), + "=v"(arch_reg[ 8]), + "=v"(arch_reg[ 9]), + "=v"(arch_reg[10]), + "=v"(arch_reg[11]), + "=v"(arch_reg[12]), + "=v"(arch_reg[13]), + "=v"(arch_reg[14]), + "=v"(arch_reg[15]), + "=v"(arch_reg[16]), + "=v"(arch_reg[17]), + "=v"(arch_reg[18]), + "=v"(arch_reg[19]), + "=v"(arch_reg[20]), + "=v"(arch_reg[21]), + "=v"(arch_reg[22]), + "=v"(arch_reg[23]), + "=v"(arch_reg[24]), + "=v"(arch_reg[25]), + "=v"(arch_reg[26]), + "=v"(arch_reg[27]), + "=v"(arch_reg[28]), + "=v"(arch_reg[29]), + "=v"(arch_reg[30]), + "=v"(arch_reg[31]) + :); #else (void)arch_reg; #endif @@ -124,8 +346,138 @@ template <> __device__ void gcnasm_accvgpr_read<64>(float* arch_reg) { #if CK_USE_AMD_XDLOPS_INLINE_ASM - NOP(16) - REPEATx64(ACCVGPR_READ, 0) + asm volatile("\ + v_accvgpr_read_b32 %0, a[ 0] \n \ + v_accvgpr_read_b32 %1, a[ 1] \n \ + v_accvgpr_read_b32 %2, a[ 2] \n \ + v_accvgpr_read_b32 %3, a[ 3] \n \ + v_accvgpr_read_b32 %4, a[ 4] \n \ + v_accvgpr_read_b32 %5, a[ 5] 
\n \ + v_accvgpr_read_b32 %6, a[ 6] \n \ + v_accvgpr_read_b32 %7, a[ 7] \n \ + v_accvgpr_read_b32 %8, a[ 8] \n \ + v_accvgpr_read_b32 %9, a[ 9] \n \ + v_accvgpr_read_b32 %10, a[10] \n \ + v_accvgpr_read_b32 %11, a[11] \n \ + v_accvgpr_read_b32 %12, a[12] \n \ + v_accvgpr_read_b32 %13, a[13] \n \ + v_accvgpr_read_b32 %14, a[14] \n \ + v_accvgpr_read_b32 %15, a[15] \n \ + v_accvgpr_read_b32 %16, a[16] \n \ + v_accvgpr_read_b32 %17, a[17] \n \ + v_accvgpr_read_b32 %18, a[18] \n \ + v_accvgpr_read_b32 %19, a[19] \n \ + v_accvgpr_read_b32 %20, a[20] \n \ + v_accvgpr_read_b32 %21, a[21] \n \ + v_accvgpr_read_b32 %22, a[22] \n \ + v_accvgpr_read_b32 %23, a[23] \n \ + v_accvgpr_read_b32 %24, a[24] \n \ + v_accvgpr_read_b32 %25, a[25] \n \ + v_accvgpr_read_b32 %26, a[26] \n \ + v_accvgpr_read_b32 %27, a[27] \n \ + v_accvgpr_read_b32 %28, a[28] \n \ + v_accvgpr_read_b32 %29, a[29] \n \ + v_accvgpr_read_b32 %30, a[30] \n \ + v_accvgpr_read_b32 %31, a[31] \n \ + v_accvgpr_read_b32 %32, a[32] \n \ + v_accvgpr_read_b32 %33, a[33] \n \ + v_accvgpr_read_b32 %34, a[34] \n \ + v_accvgpr_read_b32 %35, a[35] \n \ + v_accvgpr_read_b32 %36, a[36] \n \ + v_accvgpr_read_b32 %37, a[37] \n \ + v_accvgpr_read_b32 %38, a[38] \n \ + v_accvgpr_read_b32 %39, a[39] \n \ + v_accvgpr_read_b32 %40, a[40] \n \ + v_accvgpr_read_b32 %41, a[41] \n \ + v_accvgpr_read_b32 %42, a[42] \n \ + v_accvgpr_read_b32 %43, a[43] \n \ + v_accvgpr_read_b32 %44, a[44] \n \ + v_accvgpr_read_b32 %45, a[45] \n \ + v_accvgpr_read_b32 %46, a[46] \n \ + v_accvgpr_read_b32 %47, a[47] \n \ + v_accvgpr_read_b32 %48, a[48] \n \ + v_accvgpr_read_b32 %49, a[49] \n \ + v_accvgpr_read_b32 %50, a[50] \n \ + v_accvgpr_read_b32 %51, a[51] \n \ + v_accvgpr_read_b32 %52, a[52] \n \ + v_accvgpr_read_b32 %53, a[53] \n \ + v_accvgpr_read_b32 %54, a[54] \n \ + v_accvgpr_read_b32 %55, a[55] \n \ + v_accvgpr_read_b32 %56, a[56] \n \ + v_accvgpr_read_b32 %57, a[57] \n \ + v_accvgpr_read_b32 %58, a[58] \n \ + v_accvgpr_read_b32 %59, a[59] \n \ 
+ v_accvgpr_read_b32 %60, a[60] \n \ + v_accvgpr_read_b32 %61, a[61] \n \ + v_accvgpr_read_b32 %62, a[62] \n \ + v_accvgpr_read_b32 %63, a[63] \n \ + " + : + "=v"(arch_reg[ 0]), + "=v"(arch_reg[ 1]), + "=v"(arch_reg[ 2]), + "=v"(arch_reg[ 3]), + "=v"(arch_reg[ 4]), + "=v"(arch_reg[ 5]), + "=v"(arch_reg[ 6]), + "=v"(arch_reg[ 7]), + "=v"(arch_reg[ 8]), + "=v"(arch_reg[ 9]), + "=v"(arch_reg[10]), + "=v"(arch_reg[11]), + "=v"(arch_reg[12]), + "=v"(arch_reg[13]), + "=v"(arch_reg[14]), + "=v"(arch_reg[15]), + "=v"(arch_reg[16]), + "=v"(arch_reg[17]), + "=v"(arch_reg[18]), + "=v"(arch_reg[19]), + "=v"(arch_reg[20]), + "=v"(arch_reg[21]), + "=v"(arch_reg[22]), + "=v"(arch_reg[23]), + "=v"(arch_reg[24]), + "=v"(arch_reg[25]), + "=v"(arch_reg[26]), + "=v"(arch_reg[27]), + "=v"(arch_reg[28]), + "=v"(arch_reg[29]), + "=v"(arch_reg[30]), + "=v"(arch_reg[31]), + "=v"(arch_reg[32]), + "=v"(arch_reg[33]), + "=v"(arch_reg[34]), + "=v"(arch_reg[35]), + "=v"(arch_reg[36]), + "=v"(arch_reg[37]), + "=v"(arch_reg[38]), + "=v"(arch_reg[39]), + "=v"(arch_reg[40]), + "=v"(arch_reg[41]), + "=v"(arch_reg[42]), + "=v"(arch_reg[43]), + "=v"(arch_reg[44]), + "=v"(arch_reg[45]), + "=v"(arch_reg[46]), + "=v"(arch_reg[47]), + "=v"(arch_reg[48]), + "=v"(arch_reg[49]), + "=v"(arch_reg[50]), + "=v"(arch_reg[51]), + "=v"(arch_reg[52]), + "=v"(arch_reg[53]), + "=v"(arch_reg[54]), + "=v"(arch_reg[55]), + "=v"(arch_reg[56]), + "=v"(arch_reg[57]), + "=v"(arch_reg[58]), + "=v"(arch_reg[59]), + "=v"(arch_reg[60]), + "=v"(arch_reg[61]), + "=v"(arch_reg[62]), + "=v"(arch_reg[63]) + :); #else (void)arch_reg; #endif @@ -134,11 +486,31 @@ __device__ void gcnasm_accvgpr_read<64>(float* arch_reg) template __device__ void gcnasm_accvgpr_zero(); +template <> +__device__ void gcnasm_accvgpr_zero<4>() +{ +#if CK_USE_AMD_XDLOPS_INLINE_ASM + REPEATx4(ACCVGPR_ZERO, 0) + S_NOP(1) +#endif +} + +template <> +__device__ void gcnasm_accvgpr_zero<8>() +{ +#if CK_USE_AMD_XDLOPS_INLINE_ASM + REPEATx4(ACCVGPR_ZERO, 0) + 
REPEATx4(ACCVGPR_ZERO, 4) + S_NOP(1) +#endif +} + template <> __device__ void gcnasm_accvgpr_zero<16>() { #if CK_USE_AMD_XDLOPS_INLINE_ASM REPEATx16(ACCVGPR_ZERO, 0) + S_NOP(1) #endif } @@ -148,6 +520,7 @@ __device__ void gcnasm_accvgpr_zero<32>() #if CK_USE_AMD_XDLOPS_INLINE_ASM REPEATx16(ACCVGPR_ZERO, 0) REPEATx16(ACCVGPR_ZERO, 16) + S_NOP(1) #endif } @@ -156,17 +529,45 @@ __device__ void gcnasm_accvgpr_zero<64>() { #if CK_USE_AMD_XDLOPS_INLINE_ASM REPEATx64(ACCVGPR_ZERO, 0) + S_NOP(1) +#endif +} + +template +__device__ void gcnasm_nop(); + +template <> +__device__ void gcnasm_nop<8>() +{ +#if CK_USE_AMD_XDLOPS_INLINE_ASM + S_NOP(3) +#endif +} + +template <> +__device__ void gcnasm_nop<32>() +{ +#if CK_USE_AMD_XDLOPS_INLINE_ASM + S_NOP(9) +#endif +} + +template <> +__device__ void gcnasm_nop<64>() +{ +#if CK_USE_AMD_XDLOPS_INLINE_ASM + S_NOP(15) + S_NOP(2) #endif } template -__device__ void gcnasm_mfma_f32_32x32x1f32(float&, float&, float32_t*); +__device__ void gcnasm_mfma_f32_32x32x1f32(const float&, const float&, float32_t*); template <> -__device__ void gcnasm_mfma_f32_32x32x1f32<64, 64>(float& reg_a, float& reg_b, float32_t* reg_c) +__device__ void gcnasm_mfma_f32_32x32x1f32<64, 64>(const float& reg_a, const float& reg_b, float32_t* reg_c) { #if CK_USE_AMD_XDLOPS_INLINE_ASM - NOP(1) (void)reg_c; MFMA_F32_32x32x1F32(0, reg_a, reg_b, 1, 0, 0) MFMA_F32_32x32x1F32(32, reg_a, reg_b, 1, 1, 0) @@ -177,10 +578,9 @@ __device__ void gcnasm_mfma_f32_32x32x1f32<64, 64>(float& reg_a, float& reg_b, f } template <> -__device__ void gcnasm_mfma_f32_32x32x1f32<32, 64>(float& reg_a, float& reg_b, float32_t* reg_c) +__device__ void gcnasm_mfma_f32_32x32x1f32<32, 64>(const float& reg_a, const float& reg_b, float32_t* reg_c) { #if CK_USE_AMD_XDLOPS_INLINE_ASM - NOP(1) (void)reg_c; MFMA_F32_32x32x1F32(0, reg_a, reg_b, 1, 0, 0) #else @@ -189,10 +589,9 @@ __device__ void gcnasm_mfma_f32_32x32x1f32<32, 64>(float& reg_a, float& reg_b, f } template <> -__device__ void 
gcnasm_mfma_f32_32x32x1f32<64, 32>(float& reg_a, float& reg_b, float32_t* reg_c) +__device__ void gcnasm_mfma_f32_32x32x1f32<64, 32>(const float& reg_a, const float& reg_b, float32_t* reg_c) { #if CK_USE_AMD_XDLOPS_INLINE_ASM - NOP(1) (void)reg_c; MFMA_F32_32x32x1F32(0, reg_a, reg_b, 0, 0, 1) #else @@ -200,11 +599,9 @@ __device__ void gcnasm_mfma_f32_32x32x1f32<64, 32>(float& reg_a, float& reg_b, f #endif } - -__device__ void gcnasm_mfma_f32_32x32x2f32(float& reg_a, float& reg_b, float16_t* reg_c) +__device__ void gcnasm_mfma_f32_32x32x2f32(const float& reg_a, const float& reg_b, float16_t* reg_c) { #if CK_USE_AMD_XDLOPS_INLINE_ASM - NOP(1) (void)reg_c; MFMA_F32_32x32x2F32(0, reg_a, reg_b, 0, 0, 0) #else @@ -212,17 +609,86 @@ __device__ void gcnasm_mfma_f32_32x32x2f32(float& reg_a, float& reg_b, float16_t #endif } +__device__ void gcnasm_mfma_f32_16x16x4f32(const float& reg_a, const float& reg_b, float4_t* reg_c) +{ +#if CK_USE_AMD_XDLOPS_INLINE_ASM + (void)reg_c; + MFMA_F32_16x16x4F32(0, reg_a, reg_b, 0, 0, 0) +#else + reg_c[0] = llvm_intrin_amdgcn_mfma_f32_16x16x4f32(reg_a, reg_b, reg_c[0], 0, 0, 0); +#endif +} + +template +__device__ void gcnasm_mfma_f32_16x16x1f32(const float&, const float&, float16_t*); + +template <> +__device__ void gcnasm_mfma_f32_16x16x1f32<16, 64>(const float& reg_a, const float& reg_b, float16_t* reg_c) +{ +#if CK_USE_AMD_XDLOPS_INLINE_ASM + (void)reg_c; + MFMA_F32_16x16x1F32(0, reg_a, reg_b, 2, 0, 0) +#else + reg_c[0] = llvm_intrin_amdgcn_mfma_f32_16x16x1f32(reg_a, reg_b, reg_c[0], 2, 0, 0); +#endif +} + +template <> +__device__ void gcnasm_mfma_f32_16x16x1f32<64, 16>(const float& reg_a, const float& reg_b, float16_t* reg_c) +{ +#if CK_USE_AMD_XDLOPS_INLINE_ASM + (void)reg_c; + MFMA_F32_16x16x1F32(0, reg_a, reg_b, 0, 0, 4) +#else + reg_c[0] = llvm_intrin_amdgcn_mfma_f32_16x16x1f32(reg_a, reg_b, reg_c[0], 0, 0, 4); +#endif +} + +template +__device__ void gcnasm_mfma_f32_4x4x1f32(const float* a, const float* b, float4_t* reg_c); + 
+template <> +__device__ void gcnasm_mfma_f32_4x4x1f32<4, 64>(const float* a, const float* b, float4_t* reg_c) +{ + const float reg_a = *a; + const float reg_b = *b; + +#if CK_USE_AMD_XDLOPS_INLINE_ASM + (void)reg_c; + MFMA_F32_4x4x1F32(0, reg_a, reg_b, 4, 0, 0) +#else + reg_c[0] = llvm_intrin_amdgcn_mfma_f32_4x4x1f32(reg_a, reg_b, reg_c[0], 4, 0, 0); +#endif +} + +template <> +__device__ void gcnasm_mfma_f32_4x4x1f32<8, 64>(const float* a, const float* b, float4_t* reg_c) +{ + const float reg_a_0 = *a; + const float reg_b_0 = *b; + const float reg_a_1 = *(a + 4); + const float reg_b_1 = reg_b_0; + +#if CK_USE_AMD_XDLOPS_INLINE_ASM + (void)reg_c; + MFMA_F32_4x4x1F32(0, reg_a_0, reg_b_0, 4, 0, 0) + MFMA_F32_4x4x1F32(4, reg_a_1, reg_b_1, 4, 0, 0) +#else + reg_c[0] = llvm_intrin_amdgcn_mfma_f32_4x4x1f32(reg_a_0, reg_b_0, reg_c[0], 4, 0, 0); + reg_c[1] = llvm_intrin_amdgcn_mfma_f32_4x4x1f32(reg_a_1, reg_b_1, reg_c[1], 4, 0, 0); +#endif +} + template -__device__ void gcnasm_mfma_f32_32x32x4f16(typename vector_type::MemoryType&, - typename vector_type::MemoryType&, +__device__ void gcnasm_mfma_f32_32x32x4f16(const half4_t&, + const half4_t&, float32_t*); template <> -__device__ void gcnasm_mfma_f32_32x32x4f16<64, 64>(typename vector_type::MemoryType& reg_a, - typename vector_type::MemoryType& reg_b, +__device__ void gcnasm_mfma_f32_32x32x4f16<64, 64>(const half4_t& reg_a, + const half4_t& reg_b, float32_t* reg_c) { #if CK_USE_AMD_XDLOPS_INLINE_ASM - NOP(1) (void)reg_c; MFMA_F32_32x32x4F16(0, reg_a, reg_b, 1, 0, 0) MFMA_F32_32x32x4F16(32, reg_a, reg_b, 1, 1, 0) @@ -233,12 +699,11 @@ __device__ void gcnasm_mfma_f32_32x32x4f16<64, 64>(typename vector_type } template <> -__device__ void gcnasm_mfma_f32_32x32x4f16<32, 64>(typename vector_type::MemoryType& reg_a, - typename vector_type::MemoryType& reg_b, +__device__ void gcnasm_mfma_f32_32x32x4f16<32, 64>(const half4_t& reg_a, + const half4_t& reg_b, float32_t* reg_c) { #if CK_USE_AMD_XDLOPS_INLINE_ASM - NOP(1) (void)reg_c; 
MFMA_F32_32x32x4F16(0, reg_a, reg_b, 1, 0, 0) #else @@ -247,12 +712,11 @@ __device__ void gcnasm_mfma_f32_32x32x4f16<32, 64>(typename vector_type } template <> -__device__ void gcnasm_mfma_f32_32x32x4f16<64, 32>(typename vector_type::MemoryType& reg_a, - typename vector_type::MemoryType& reg_b, +__device__ void gcnasm_mfma_f32_32x32x4f16<64, 32>(const half4_t& reg_a, + const half4_t& reg_b, float32_t* reg_c) { #if CK_USE_AMD_XDLOPS_INLINE_ASM - NOP(1) (void)reg_c; MFMA_F32_32x32x4F16(0, reg_a, reg_b, 0, 0, 1) #else @@ -260,12 +724,11 @@ __device__ void gcnasm_mfma_f32_32x32x4f16<64, 32>(typename vector_type #endif } -__device__ void gcnasm_mfma_f32_32x32x8f16(typename vector_type::MemoryType& reg_a, - typename vector_type::MemoryType& reg_b, +__device__ void gcnasm_mfma_f32_32x32x8f16(const half4_t& reg_a, + const half4_t& reg_b, float16_t* reg_c) { #if CK_USE_AMD_XDLOPS_INLINE_ASM - NOP(1) (void)reg_c; MFMA_F32_32x32x8F16(0, reg_a, reg_b, 0, 0, 0) #else @@ -273,18 +736,102 @@ __device__ void gcnasm_mfma_f32_32x32x8f16(typename vector_type::Memory #endif } +__device__ void gcnasm_mfma_f32_16x16x16f16(const half4_t& reg_a, + const half4_t& reg_b, + float4_t* reg_c) +{ +#if CK_USE_AMD_XDLOPS_INLINE_ASM + (void)reg_c; + MFMA_F32_16x16x16F16(0, reg_a, reg_b, 0, 0, 0) +#else + reg_c[0] = llvm_intrin_amdgcn_mfma_f32_16x16x16f16(reg_a, reg_b, reg_c[0], 0, 0, 0); +#endif +} + +template +__device__ void gcnasm_mfma_f32_16x16x4f16(const half4_t& reg_a, + const half4_t& reg_b, + float16_t* reg_c); + +template <> +__device__ void gcnasm_mfma_f32_16x16x4f16<16, 64>(const half4_t& reg_a, + const half4_t& reg_b, + float16_t* reg_c) +{ +#if CK_USE_AMD_XDLOPS_INLINE_ASM + (void)reg_c; + MFMA_F32_16x16x4F16(0, reg_a, reg_b, 2, 0, 0) +#else + reg_c[0] = llvm_intrin_amdgcn_mfma_f32_16x16x4f16(reg_a, reg_b, reg_c[0], 2, 0, 0); +#endif +} + +template <> +__device__ void gcnasm_mfma_f32_16x16x4f16<64, 16>(const half4_t& reg_a, + const half4_t& reg_b, + float16_t* reg_c) + +{ +#if 
CK_USE_AMD_XDLOPS_INLINE_ASM + (void)reg_c; + MFMA_F32_16x16x4F16(0, reg_a, reg_b, 0, 0, 4) +#else + reg_c[0] = llvm_intrin_amdgcn_mfma_f32_16x16x4f16(reg_a, reg_b, reg_c[0], 0, 0, 4); +#endif +} + +template +__device__ void gcnasm_mfma_f32_4x4x4f16(const half4_t *a, + const half4_t* b, + float4_t* reg_c); + +template <> +__device__ void gcnasm_mfma_f32_4x4x4f16<4, 64>(const half4_t *a, + const half4_t* b, + float4_t* reg_c) +{ + const half4_t reg_a = *a; + const half4_t reg_b = *b; + +#if CK_USE_AMD_XDLOPS_INLINE_ASM + (void)reg_c; + MFMA_F32_4x4x4F16(0, reg_a, reg_b, 4, 0, 0) +#else + reg_c[0] = llvm_intrin_amdgcn_mfma_f32_4x4x4f16(reg_a, reg_b, reg_c[0], 4, 0, 0); +#endif +} + +template <> +__device__ void gcnasm_mfma_f32_4x4x4f16<8, 64>(const half4_t *a, + const half4_t* b, + float4_t* reg_c) +{ + const half4_t reg_a_0 = *a; + const half4_t reg_b_0 = *b; + const half4_t reg_a_1 = *(a + 4); + const half4_t reg_b_1 = reg_b_0; + +#if CK_USE_AMD_XDLOPS_INLINE_ASM + (void)reg_c; + MFMA_F32_4x4x4F16(0, reg_a_0, reg_b_0, 4, 0, 0) + MFMA_F32_4x4x4F16(4, reg_a_1, reg_b_1, 4, 0, 0) +#else + reg_c[0] = llvm_intrin_amdgcn_mfma_f32_4x4x4f16(reg_a_0, reg_b_0, reg_c[0], 4, 0, 0); + reg_c[1] = llvm_intrin_amdgcn_mfma_f32_4x4x4f16(reg_a_1, reg_b_1, reg_c[1], 4, 0, 0); +#endif +} + template -__device__ void gcnasm_mfma_f32_32x32x2bf16(typename vector_type::MemoryType&, - typename vector_type::MemoryType&, +__device__ void gcnasm_mfma_f32_32x32x2bf16(const ushort2_t&, + const ushort2_t&, float32_t*); template <> -__device__ void gcnasm_mfma_f32_32x32x2bf16<64, 64>(typename vector_type::MemoryType& reg_a, - typename vector_type::MemoryType& reg_b, +__device__ void gcnasm_mfma_f32_32x32x2bf16<64, 64>(const ushort2_t& reg_a, + const ushort2_t& reg_b, float32_t* reg_c) { #if CK_USE_AMD_XDLOPS_INLINE_ASM - NOP(1) (void)reg_c; MFMA_F32_32x32x2BF16(0, reg_a, reg_b, 1, 0, 0) MFMA_F32_32x32x2BF16(32, reg_a, reg_b, 1, 1, 0) @@ -295,12 +842,11 @@ __device__ void 
gcnasm_mfma_f32_32x32x2bf16<64, 64>(typename vector_type -__device__ void gcnasm_mfma_f32_32x32x2bf16<32, 64>(typename vector_type::MemoryType& reg_a, - typename vector_type::MemoryType& reg_b, +__device__ void gcnasm_mfma_f32_32x32x2bf16<32, 64>(const ushort2_t& reg_a, + const ushort2_t& reg_b, float32_t* reg_c) { #if CK_USE_AMD_XDLOPS_INLINE_ASM - NOP(1) (void)reg_c; MFMA_F32_32x32x2BF16(0, reg_a, reg_b, 1, 0, 0) #else @@ -309,12 +855,11 @@ __device__ void gcnasm_mfma_f32_32x32x2bf16<32, 64>(typename vector_type -__device__ void gcnasm_mfma_f32_32x32x2bf16<64, 32>(typename vector_type::MemoryType& reg_a, - typename vector_type::MemoryType& reg_b, +__device__ void gcnasm_mfma_f32_32x32x2bf16<64, 32>(const ushort2_t& reg_a, + const ushort2_t& reg_b, float32_t* reg_c) { #if CK_USE_AMD_XDLOPS_INLINE_ASM - NOP(1) (void)reg_c; MFMA_F32_32x32x2BF16(0, reg_a, reg_b, 0, 0, 1) #else @@ -322,18 +867,101 @@ __device__ void gcnasm_mfma_f32_32x32x2bf16<64, 32>(typename vector_type::MemoryType& reg_a, - typename vector_type::MemoryType& reg_b, +__device__ void gcnasm_mfma_f32_32x32x4bf16(const ushort2_t& reg_a, + const ushort2_t& reg_b, float16_t* reg_c) { #if CK_USE_AMD_XDLOPS_INLINE_ASM - NOP(1) (void)reg_c; MFMA_F32_32x32x4BF16(0, reg_a, reg_b, 0, 0, 0) #else reg_c[0] = llvm_intrin_amdgcn_mfma_f32_32x32x4bf16(reg_a, reg_b, reg_c[0], 0, 0, 0); #endif } + +__device__ void gcnasm_mfma_f32_16x16x8bf16(const ushort2_t& reg_a, + const ushort2_t& reg_b, + float4_t* reg_c) +{ +#if CK_USE_AMD_XDLOPS_INLINE_ASM + (void)reg_c; + MFMA_F32_16x16x8BF16(0, reg_a, reg_b, 0, 0, 0) +#else + reg_c[0] = llvm_intrin_amdgcn_mfma_f32_16x16x8bf16(reg_a, reg_b, reg_c[0], 0, 0, 0); +#endif +} + +template +__device__ void gcnasm_mfma_f32_16x16x2bf16(const ushort2_t& reg_a, + const ushort2_t& reg_b, + float16_t* reg_c); + +template <> +__device__ void gcnasm_mfma_f32_16x16x2bf16<16, 64>(const ushort2_t& reg_a, + const ushort2_t& reg_b, + float16_t* reg_c) +{ +#if CK_USE_AMD_XDLOPS_INLINE_ASM + 
(void)reg_c; + MFMA_F32_16x16x2BF16(0, reg_a, reg_b, 2, 0, 0) +#else + reg_c[0] = llvm_intrin_amdgcn_mfma_f32_16x16x2bf16(reg_a, reg_b, reg_c[0], 2, 0, 0); +#endif +} + +template <> +__device__ void gcnasm_mfma_f32_16x16x2bf16<64, 16>(const ushort2_t& reg_a, + const ushort2_t& reg_b, + float16_t* reg_c) +{ +#if CK_USE_AMD_XDLOPS_INLINE_ASM + (void)reg_c; + MFMA_F32_16x16x2BF16(0, reg_a, reg_b, 0, 0, 4) +#else + reg_c[0] = llvm_intrin_amdgcn_mfma_f32_16x16x2bf16(reg_a, reg_b, reg_c[0], 0, 0, 4); +#endif +} + +template +__device__ void gcnasm_mfma_f32_4x4x2bf16(const ushort2_t *a, + const ushort2_t *b, + float4_t* reg_c); + +template <> +__device__ void gcnasm_mfma_f32_4x4x2bf16<4, 64>(const ushort2_t *a, + const ushort2_t *b, + float4_t* reg_c) +{ + const ushort2_t reg_a = *a; + const ushort2_t reg_b = *b; + +#if CK_USE_AMD_XDLOPS_INLINE_ASM + (void)reg_c; + MFMA_F32_4x4x2BF16(0, reg_a, reg_b, 4, 0, 0) +#else + reg_c[0] = llvm_intrin_amdgcn_mfma_f32_4x4x2bf16(reg_a, reg_b, reg_c[0], 4, 0, 0); +#endif +} + +template <> +__device__ void gcnasm_mfma_f32_4x4x2bf16<8, 64>(const ushort2_t *a, + const ushort2_t *b, + float4_t* reg_c) +{ + const ushort2_t reg_a_0 = *a; + const ushort2_t reg_b_0 = *b; + const ushort2_t reg_a_1 = *(a + 4); + const ushort2_t reg_b_1 = reg_b_0; + +#if CK_USE_AMD_XDLOPS_INLINE_ASM + (void)reg_c; + MFMA_F32_4x4x2BF16(0, reg_a_0, reg_b_0, 4, 0, 0) + MFMA_F32_4x4x2BF16(4, reg_a_1, reg_b_1, 4, 0, 0) +#else + reg_c[0] = llvm_intrin_amdgcn_mfma_f32_4x4x2bf16(reg_a_0, reg_b_0, reg_c[0], 4, 0, 0); + reg_c[1] = llvm_intrin_amdgcn_mfma_f32_4x4x2bf16(reg_a_1, reg_b_1, reg_c[1], 4, 0, 0); +#endif +} // clang-format on } #endif From d1d618c64ea208bb2a9ed87eb73dca5babd8fd0c Mon Sep 17 00:00:00 2001 From: Jing Zhou Date: Thu, 2 Jan 2020 19:36:49 -0800 Subject: [PATCH 03/15] workaround: skip opencl dropout test for Issue #2335 (#2336) --- test/dropout.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/dropout.cpp b/test/dropout.cpp index 
eff5350908..3d2b8afbd1 100644 --- a/test/dropout.cpp +++ b/test/dropout.cpp @@ -549,6 +549,13 @@ struct dropout_driver : test_driver void run() { +// Workaround for issue #2335. +// OpenCL error creating buffer: 0 Invalid Buffer Size +#if MIOPEN_BACKEND_OPENCL + std::cout << "Skip test for Issue #2335: " << std::endl; + return; +#endif + miopen::DropoutDescriptor DropoutDesc; unsigned long max_value = miopen_type{} == miopenHalf ? 5 : 17; auto&& handle = get_handle(); From 5e0532abf148efe98d775c58b5c20342d0b8d416 Mon Sep 17 00:00:00 2001 From: Jing Zhou Date: Fri, 3 Jan 2020 20:11:17 -0800 Subject: [PATCH 04/15] Fix issues in lrn (#2305) * correct lrn radius in ctest * refactor lrn driver cpu verification * fix lrn forward path bugs * fix lrn backward path bugs * fix lrn ctest * add is_packed check for lrn forward path * clang tidy * add lrn kernel size check * revert is packed check * remove empty lines * use throw check --- driver/mloNormHost.hpp | 80 ++++++++------ src/kernels/MIOpenLRNBwd.cl | 19 ++-- src/kernels/MIOpenLRNFwd.cl | 14 +-- src/ocl/lrn_ocl.cpp | 18 ++-- src/ocl/mloNorm.cpp | 34 +++++- test/lrn_test.cpp | 209 ++++++++++++++++++------------------ 6 files changed, 212 insertions(+), 162 deletions(-) diff --git a/driver/mloNormHost.hpp b/driver/mloNormHost.hpp index f9dbcf5f10..34cd4b149a 100644 --- a/driver/mloNormHost.hpp +++ b/driver/mloNormHost.hpp @@ -71,6 +71,12 @@ int mloLRNForwardRunHost(bool do_scale, { int ret = 0; + if(local_area < 1 + pad) + { + std::cout << "ERROR: Lrn kernel size is insufficient." << std::endl; + return -1; + } + if(norm_region == MLO_LRN_ACROSS_CHANNELS) { for(int b = 0; b < n_batchs; b++) @@ -94,7 +100,7 @@ int mloLRNForwardRunHost(bool do_scale, ++head; } // until we reach size, nothing needs to be subtracted - while(head < local_area && head - pad >= 0 && head < n_inputs) + while(head < local_area) { bot_val = (head < n_inputs) ? 
static_cast<_Tcheck>( @@ -103,21 +109,21 @@ int mloLRNForwardRunHost(bool do_scale, : static_cast<_Tcheck>(0); accum_scale += bot_val * bot_val; _Tcheck scale = K + accum_scale * alphaoverarea; - if((head - pad) >= 0 && do_scale) + if((head - pad) >= 0 && (head - pad) < n_outputs && do_scale) { scale_v_ptr[b * scale_v_batch_stride + (head - pad) * scale_v_channel_stride + j * scale_v_stride + i] = scale; } bot_val = - ((head - pad) >= 0) + ((head - pad) >= 0 && (head - pad) < n_inputs) ? static_cast<_Tcheck>(bot_ptr[b * bot_batch_stride + (head - pad) * bot_channel_stride + j * bot_stride + i]) : static_cast<_Tcheck>(0); _Tcheck s = pow(scale, -beta); _Tcheck c_val = bot_val * s; - if((head - pad) >= 0) + if((head - pad) >= 0 && (head - pad) < n_outputs) { top_v_ptr[b * top_v_batch_stride + (head - pad) * top_v_channel_stride + j * top_v_stride + i] = c_val; @@ -209,8 +215,8 @@ int mloLRNForwardRunHost(bool do_scale, { // c-emulator _Tcheck scale = static_cast<_Tcheck>(0); - int hstart = j - pad; - int wstart = i - pad; + int hstart = j - (local_area - 1 - pad); + int wstart = i - (local_area - 1 - pad); int hend = std::min(hstart + local_area, bot_height + pad); int wend = std::min(wstart + local_area, bot_width + pad); int adj_area_size = (hend - hstart) * (wend - wstart); @@ -297,6 +303,12 @@ int mloLRNBackwardRunHost(int norm_region, int ret = 0; _Tcheck negative_beta = -beta; + int pre_pad = local_area - 1 - pad; + if(pre_pad < 0) + { + std::cout << "ERROR: Lrn kernel size is insufficient." 
<< std::endl; + return -1; + } if(norm_region == MLO_LRN_ACROSS_CHANNELS) { @@ -316,7 +328,7 @@ int mloLRNBackwardRunHost(int norm_region, _Tcheck accum_ratio = static_cast<_Tcheck>(0); // accumulate values - while(head < pad) + while(head < pre_pad) { if(head < n_inputs) { @@ -357,24 +369,24 @@ int mloLRNBackwardRunHost(int norm_region, accum_ratio += adder; } - if(head - pad >= 0 && head - pad < n_inputs) + if(head - pre_pad >= 0 && head - pre_pad < n_inputs) { bot_df_v_ptr[b * bot_df_v_batch_stride + - (head - pad) * bot_df_v_channel_stride + + (head - pre_pad) * bot_df_v_channel_stride + j * bot_df_v_stride + i] = static_cast<_Tcheck>( top_df_ptr[b * top_df_batch_stride + - (head - pad) * top_df_channel_stride + + (head - pre_pad) * top_df_channel_stride + j * top_df_stride + i]) * pow(static_cast<_Tcheck>( scale_ptr[b * scale_batch_stride + - (head - pad) * scale_channel_stride + + (head - pre_pad) * scale_channel_stride + j * scale_stride + i]), negative_beta) - - ratio_dta_bwd * - static_cast<_Tcheck>(bot_ptr[b * bot_batch_stride + - (head - pad) * bot_channel_stride + - j * bot_stride + i]) * + ratio_dta_bwd * static_cast<_Tcheck>( + bot_ptr[b * bot_batch_stride + + (head - pre_pad) * bot_channel_stride + + j * bot_stride + i]) * accum_ratio; } ++head; @@ -415,31 +427,31 @@ int mloLRNBackwardRunHost(int norm_region, accum_ratio -= subs; } - if(head - pad >= 0) + if(head - pre_pad >= 0) { bot_df_v_ptr[b * bot_df_v_batch_stride + - (head - pad) * bot_df_v_channel_stride + + (head - pre_pad) * bot_df_v_channel_stride + j * bot_df_v_stride + i] = static_cast<_Tcheck>( top_df_ptr[b * top_df_batch_stride + - (head - pad) * top_df_channel_stride + + (head - pre_pad) * top_df_channel_stride + j * top_df_stride + i]) * pow(static_cast<_Tcheck>( scale_ptr[b * scale_batch_stride + - (head - pad) * scale_channel_stride + + (head - pre_pad) * scale_channel_stride + j * scale_stride + i]), negative_beta) - - ratio_dta_bwd * - static_cast<_Tcheck>(bot_ptr[b * 
bot_batch_stride + - (head - pad) * bot_channel_stride + - j * bot_stride + i]) * + ratio_dta_bwd * static_cast<_Tcheck>( + bot_ptr[b * bot_batch_stride + + (head - pre_pad) * bot_channel_stride + + j * bot_stride + i]) * accum_ratio; } ++head; } // subtract only - while(head < n_inputs + pad) + while(head < n_inputs + pre_pad) { if(head - local_area >= 0 && head - local_area < n_inputs) { @@ -459,24 +471,24 @@ int mloLRNBackwardRunHost(int norm_region, accum_ratio -= subs; } - if(head - pad >= 0 && head - pad < n_inputs) + if(head - pre_pad >= 0 && head - pre_pad < n_inputs) { bot_df_v_ptr[b * bot_df_v_batch_stride + - (head - pad) * bot_df_v_channel_stride + + (head - pre_pad) * bot_df_v_channel_stride + j * bot_df_v_stride + i] = static_cast<_Tcheck>( top_df_ptr[b * top_df_batch_stride + - (head - pad) * top_df_channel_stride + + (head - pre_pad) * top_df_channel_stride + j * top_df_stride + i]) * pow(static_cast<_Tcheck>( scale_ptr[b * scale_batch_stride + - (head - pad) * scale_channel_stride + + (head - pre_pad) * scale_channel_stride + j * scale_stride + i]), negative_beta) - - ratio_dta_bwd * - static_cast<_Tcheck>(bot_ptr[b * bot_batch_stride + - (head - pad) * bot_channel_stride + - j * bot_stride + i]) * + ratio_dta_bwd * static_cast<_Tcheck>( + bot_ptr[b * bot_batch_stride + + (head - pre_pad) * bot_channel_stride + + j * bot_stride + i]) * accum_ratio; } @@ -502,8 +514,8 @@ int mloLRNBackwardRunHost(int norm_region, int hstart = j - pad; int wstart = i - pad; - int hend = std::min(hstart + local_area, top_height + pad); - int wend = std::min(wstart + local_area, top_width + pad); + int hend = std::min(hstart + local_area, top_height + pre_pad); + int wend = std::min(wstart + local_area, top_width + pre_pad); int adj_area_size = (hend - hstart) * (wend - wstart); hstart = std::max(hstart, 0); wstart = std::max(wstart, 0); diff --git a/src/kernels/MIOpenLRNBwd.cl b/src/kernels/MIOpenLRNBwd.cl index 9bdb6f7529..36c56a7e50 100644 --- 
a/src/kernels/MIOpenLRNBwd.cl +++ b/src/kernels/MIOpenLRNBwd.cl @@ -215,7 +215,7 @@ MIOpenLRNWithinChannelBwd(const __global _FLOAT* top, { int v_off_v = (lcl_id1 * MLO_LRN_N_VERT_OUT_PIX + j); int hstart = y + v_off_v - MLO_LRN_PAD; - int hend = min(hstart + MLO_LRN_KERNEL_SZ, MLO_LRN_TOP_HEIGHT + MLO_LRN_PAD); + int hend = min(hstart + MLO_LRN_KERNEL_SZ, MLO_LRN_TOP_HEIGHT + MLO_LRN_PRE_PAD); // accum offset, vertical // int lcl_a_off_v = v_off_v * MLO_LRN_LCL_DATA_WIDTH; @@ -227,7 +227,7 @@ MIOpenLRNWithinChannelBwd(const __global _FLOAT* top, int v_off_h = lcl_id0 * MLO_LRN_N_HORIZ_OUT_PIX + i; int wstart = x + v_off_h - MLO_LRN_PAD; - int wend = min(wstart + MLO_LRN_KERNEL_SZ, MLO_LRN_TOP_WIDTH + MLO_LRN_PAD); + int wend = min(wstart + MLO_LRN_KERNEL_SZ, MLO_LRN_TOP_WIDTH + MLO_LRN_PRE_PAD); int adj_area_size = (hend - hstart) * (wend - wstart); @@ -250,7 +250,8 @@ MIOpenLRNWithinChannelBwd(const __global _FLOAT* top, uint bot_off0 = MLO_LRN_BOT_BATCH_STRIDE * b + MLO_LRN_BOT_CHANNEL_STRIDE * o + MLO_LRN_BOT_STRIDE * (y + v_off_v) + x + v_off_h; - uint bot_off = (bot_off0 < MLO_LRN_BATCH_SZ * MLO_LRN_BOT_BATCH_STRIDE) + uint bot_off = (y + v_off_v < MLO_LRN_BOT_HEIGHT && x + v_off_h < MLO_LRN_BOT_WIDTH && + b < MLO_LRN_BATCH_SZ && o < MLO_LRN_N_OUTPUTS) ? bot_off0 : MLO_LRN_BATCH_SZ * MLO_LRN_BOT_BATCH_STRIDE - 1; #if DBG_RANGE @@ -262,7 +263,10 @@ MIOpenLRNWithinChannelBwd(const __global _FLOAT* top, #endif _FLOAT bot_dta = bot[bot_off]; - bot_dta = (bot_off0 < MLO_LRN_BATCH_SZ * MLO_LRN_BOT_BATCH_STRIDE) ? bot_dta : 0; + bot_dta = (y + v_off_v < MLO_LRN_BOT_HEIGHT && x + v_off_h < MLO_LRN_BOT_WIDTH && + b < MLO_LRN_BATCH_SZ && o < MLO_LRN_N_OUTPUTS) + ? 
bot_dta + : 0; _FLOAT adj_ratio = (_FLOAT)2.f * alpha * beta / adj_area_size; _FLOAT prv_accum_ratio = adj_ratio * bot_dta * prv_ratio_accum; @@ -274,7 +278,8 @@ MIOpenLRNWithinChannelBwd(const __global _FLOAT* top, { for(int i = 0; i < MLO_LRN_N_HORIZ_OUT_PIX; i++) { - if(bot_y + j < MLO_LRN_BOT_HEIGHT && bot_x + i < MLO_LRN_BOT_WIDTH) + if(bot_y + j < MLO_LRN_BOT_HEIGHT && bot_x + i < MLO_LRN_BOT_WIDTH && + b < MLO_LRN_BATCH_SZ && o < MLO_LRN_N_OUTPUTS) { #if DBG_RANGE @@ -319,7 +324,7 @@ MIOpenLRNAcrossChannelsBwd1(const __global _FLOAT* top, int c_i = 0, c_o = 0; int bot_df_off = 0; - for(c_i = 0; c_i < MLO_LRN_PAD; c_i++) + for(c_i = 0; c_i < MLO_LRN_PRE_PAD; c_i++) { top_df_in[c_i] = top_df[MLO_LRN_TOPDF_BATCH_STRIDE * b + @@ -429,7 +434,7 @@ MIOpenLRNAcrossChannelsBwd1(const __global _FLOAT* top, } } - for(; c_i < MLO_LRN_N_INPUTS + MLO_LRN_PAD; c_i++, c_o++) + for(; c_i < MLO_LRN_N_INPUTS + MLO_LRN_PRE_PAD; c_i++, c_o++) { accum_ratio = accum_ratio - ratio_dta[0]; diff --git a/src/kernels/MIOpenLRNFwd.cl b/src/kernels/MIOpenLRNFwd.cl index 4ea81c339a..6a374055ba 100644 --- a/src/kernels/MIOpenLRNFwd.cl +++ b/src/kernels/MIOpenLRNFwd.cl @@ -50,7 +50,7 @@ #define MLO_LRN_GROUP_SZ2 1 #define MLO_LRN_STRIDE 1 -#define MLO_LRN_LEFT_PAD0 (((MLO_LRN_PAD0 + MLO_READ_UNIT - 1) / MLO_READ_UNIT) * MLO_READ_UNIT) +#define MLO_LRN_LEFT_PAD0 (((MLO_LRN_PRE_PAD0 + MLO_READ_UNIT - 1) / MLO_READ_UNIT) * MLO_READ_UNIT) #define MLO_LRN_RIGHT_SIDE \ (((MLO_LRN_GROUP_SZ0 * MLO_LRN_N_HORIZ_OUT_PIX + MLO_LRN_PAD0 + MLO_READ_UNIT - 1) / \ MLO_READ_UNIT) * \ @@ -98,7 +98,7 @@ MIOpenLRNWithinChannel_PS(const __global _FLOAT* bot, // load tile for(int b_j = lcl_id1; b_j < MLO_LRN_LCL_DATA_HEIGHT; b_j += MLO_LRN_GROUP_SZ1) { - int bot_y_act = bot_y + b_j - MLO_LRN_PAD1; + int bot_y_act = bot_y + b_j - MLO_LRN_PRE_PAD1; bool invisibleY = (bot_y_act < 0) || (bot_y_act >= MLO_LRN_BOT_HEIGHT); @@ -145,7 +145,7 @@ MIOpenLRNWithinChannel_PS(const __global _FLOAT* bot, int lcl_y = 
mul24(lcl_id1, (int)MLO_LRN_N_VERT_OUT_PIX); int lcl_x = - mad24(lcl_id0, (int)(MLO_LRN_N_HORIZ_OUT_PIX), (int)(MLO_LRN_LEFT_PAD0 - MLO_LRN_PAD0)); + mad24(lcl_id0, (int)(MLO_LRN_N_HORIZ_OUT_PIX), (int)(MLO_LRN_LEFT_PAD0 - MLO_LRN_PRE_PAD0)); int lcl_off = mad24(lcl_y, MLO_LRN_LCL_DATA_WIDTH, lcl_x); for(int j = 0; j < MLO_LRN_N_VERT_OUT_PIX; ++j) @@ -382,9 +382,9 @@ MIOpenLRNWithinChannel_PS(const __global _FLOAT* bot, _FLOAT s; s = exp((_FLOAT)-beta * log(prv_scale[k][l])); // s = pow(prv_scale[k][l], -beta); - _FLOAT bot_val = bot_data[lcl_off + mad24((k + MLO_LRN_PAD1), + _FLOAT bot_val = bot_data[lcl_off + mad24((k + MLO_LRN_PRE_PAD1), (int)MLO_LRN_LCL_DATA_WIDTH, - (l + MLO_LRN_PAD0))]; + (l + MLO_LRN_PRE_PAD0))]; #if MLO_LRN_DO_SCALE scale[scale_off + k * MLO_LRN_SCALE_STRIDE + l] = prv_scale[k][l]; #endif @@ -597,7 +597,7 @@ MIOpenLRNAcrossChannels4(const __global _FLOAT* bottom, // pow(prv_scale,-beta); // bug // MLO_READ_TYPE prv_out = sqrt(bot_in2[MLO_LRN_PAD]); - MLO_READ_TYPE prv_out = bot_in2[MLO_LRN_PAD]; + MLO_READ_TYPE prv_out = bot_in2[MLO_LRN_PRE_PAD]; prv_out = sqrt(prv_out); MLO_READ_TYPE out_val = prv_out * exp_scale; @@ -661,7 +661,7 @@ MIOpenLRNAcrossChannels4(const __global _FLOAT* bottom, // pow(prv_scale,-beta); // bug // MLO_READ_TYPE prv_out = sqrt(bot_in2[MLO_LRN_PAD]); - MLO_READ_TYPE prv_out = bot_in2[MLO_LRN_PAD]; + MLO_READ_TYPE prv_out = bot_in2[MLO_LRN_PRE_PAD]; prv_out = sqrt(prv_out); MLO_READ_TYPE out_val = prv_out * exp_scale; diff --git a/src/ocl/lrn_ocl.cpp b/src/ocl/lrn_ocl.cpp index 83736e9a9b..eb56cb7139 100644 --- a/src/ocl/lrn_ocl.cpp +++ b/src/ocl/lrn_ocl.cpp @@ -40,6 +40,8 @@ miopenStatus_t LRNDescriptor::Forward(Handle& handle, bool do_backward, Data_t workSpace) const { + if(!(xDesc.IsPacked() && yDesc.IsPacked())) + MIOPEN_THROW("Only support packed tensors"); miopenStatus_t status = miopenStatusSuccess; mlo_construct_norm construct_params(1); // forward @@ -109,10 +111,10 @@ miopenStatus_t 
LRNDescriptor::Forward(Handle& handle, std::to_string(f_norm_alphaoverarea) + std::to_string(local_ar) + std::to_string(norm_region) + std::to_string(static_cast(do_backward)) + std::to_string(xDesc.GetType()) + std::to_string(nInStride) + std::to_string(nOutStride) + - std::to_string(nIn) + std::to_string(nOut) + std::to_string(nInStride) + - std::to_string(nOutStride) + std::to_string(cIn) + std::to_string(cOut) + - std::to_string(cInStride) + std::to_string(cOutStride) + std::to_string(hIn) + - std::to_string(hOut); + std::to_string(nIn) + std::to_string(nOut) + std::to_string(cInStride) + + std::to_string(cOutStride) + std::to_string(cIn) + std::to_string(cOut) + + std::to_string(hInStride) + std::to_string(hOutStride) + std::to_string(hIn) + + std::to_string(hOut) + std::to_string(wIn) + std::to_string(wOut); auto&& kernels = handle.GetKernels(algo_name, network_config); if(!kernels.empty()) @@ -285,10 +287,10 @@ miopenStatus_t LRNDescriptor::Backward(Handle& handle, std::to_string(norm_alphaoverarea) + std::to_string(local_ar) + std::to_string(norm_region) + std::to_string(f_norm_ratio) + std::to_string(xDesc.GetType()) + std::to_string(nInStride) + std::to_string(nOutStride) + - std::to_string(nIn) + std::to_string(nOut) + std::to_string(nInStride) + - std::to_string(nOutStride) + std::to_string(cIn) + std::to_string(cOut) + - std::to_string(cInStride) + std::to_string(cOutStride) + std::to_string(hIn) + - std::to_string(hOut); + std::to_string(nIn) + std::to_string(nOut) + std::to_string(cInStride) + + std::to_string(cOutStride) + std::to_string(cIn) + std::to_string(cOut) + + std::to_string(hInStride) + std::to_string(hOutStride) + std::to_string(hIn) + + std::to_string(hOut) + std::to_string(wIn) + std::to_string(wOut); auto&& kernels = handle.GetKernels(algo_name, network_config); if(!kernels.empty()) diff --git a/src/ocl/mloNorm.cpp b/src/ocl/mloNorm.cpp index cc4dd3d5d6..5a8a7be4a2 100644 --- a/src/ocl/mloNorm.cpp +++ b/src/ocl/mloNorm.cpp @@ -69,6 
+69,11 @@ void mlo_construct_norm::mloConstruct() } } +inline bool is_tensor_packed(int c, int h, int w, int b_str, int c_str, int h_str) +{ + return h_str == w && c_str == h * h_str && b_str == c * c_str; +} + int mlo_construct_norm::mloConstructFwd() { int ret = 0; @@ -80,6 +85,9 @@ int mlo_construct_norm::mloConstructFwd() int pre_pad = (_norm_area - 1) / 2; int pad = _norm_area - pre_pad - 1; + if(pre_pad < 0 || pad < 0) + MIOPEN_THROW("Wrong LRN kernel size"); + int top_df_stride = 1; int top_df_channel_stride = 1; int top_df_batch_stride = 1; @@ -93,7 +101,14 @@ int mlo_construct_norm::mloConstructFwd() _out_pix_tile0 = 1; _out_pix_tile1 = 1; - int MAP_SZ4 = _search_params.in_width * _search_params.in_height; + auto is_in_packed = is_tensor_packed(_search_params.n_inputs, + _search_params.in_height, + _search_params.in_width, + _search_params.in_batch_stride, + _search_params.in_channel_stride, + _search_params.in_stride); + + int MAP_SZ4 = _search_params.in_width * (is_in_packed ? _search_params.in_height : 1); int read_unit; if(_norm_region == MLO_LRN_ACROSS_CHANNELS) { @@ -110,11 +125,14 @@ int mlo_construct_norm::mloConstructFwd() read_unit = 4; MAP_SZ4 = (MAP_SZ4 + 3) / 4; } + MAP_SZ4 *= (is_in_packed ? 1 : _search_params.in_height); + + assert(_out_pix_tile0 - 1 <= _norm_area && _out_pix_tile1 - 1 <= _norm_area); auto ocl_group_lg2sz0 = - static_cast(ceil(log(static_cast(_out_pix_tile0) / log(2.)))); + static_cast(ceil(log(static_cast(_out_pix_tile0)) / log(2.))); auto ocl_group_lg2sz1 = - static_cast(ceil(log(static_cast(_out_pix_tile1) / log(2.)))); + static_cast(ceil(log(static_cast(_out_pix_tile1)) / log(2.))); _kernel_file = "MIOpenLRNFwd.cl"; _kernel_name = (_norm_region == MLO_LRN_ACROSS_CHANNELS) ? 
"MIOpenLRNAcrossChannels4" @@ -126,9 +144,10 @@ int mlo_construct_norm::mloConstructFwd() int n_waves = (_search_params.batch_sz * MAP_SZ4 + _hw_wave_sz - 1) / _hw_wave_sz; if(n_waves <= maxComputeUnits * 8) { - MAP_SZ4 = _search_params.in_width * _search_params.in_height; + MAP_SZ4 = _search_params.in_width * (is_in_packed ? _search_params.in_height : 1); read_unit = (MAP_SZ4 % 2 == 0) ? 2 : 1; MAP_SZ4 /= read_unit; + MAP_SZ4 *= (is_in_packed ? 1 : _search_params.in_height); } } @@ -181,6 +200,9 @@ int mlo_construct_norm::mloConstructFwd() std::string(" -DMLO_LRN_PAD1=") + std::to_string(static_cast(pad)) + std::string(" -DMLO_LRN_KERNEL_SZ0=") + std::to_string(static_cast(_norm_area)) + std::string(" -DMLO_LRN_PAD0=") + std::to_string(static_cast(pad)) + + std::string(" -DMLO_LRN_PRE_PAD=") + std::to_string(static_cast(pre_pad)) + + std::string(" -DMLO_LRN_PRE_PAD1=") + std::to_string(static_cast(pre_pad)) + + std::string(" -DMLO_LRN_PRE_PAD0=") + std::to_string(static_cast(pre_pad)) + std::string(" -DMLO_LRN_N_OUTPUTS=") + std::to_string(static_cast(_search_params.n_outputs)) + std::string(" -DMLO_LRN_N_INPUTS=") + @@ -306,6 +328,9 @@ int mlo_construct_norm::mloConstructBwd() int scale_channel_stride = _search_params.out_channel_stride; int scale_batch_stride = _search_params.out_batch_stride; + if(pre_pad < 0 || pad < 0) + MIOPEN_THROW("Wrong LRN kernel size"); + _comp_options = std::string(" -DMLO_LRN_KERNEL_SZ=") + std::to_string(static_cast(_norm_area)) + std::string(" -DMLO_LRN_N_OUTPUTS=") + @@ -313,6 +338,7 @@ int mlo_construct_norm::mloConstructBwd() std::string(" -DMLO_LRN_N_CHANNELS=") + std::to_string(static_cast(_search_params.n_inputs)) + std::string(" -DMLO_LRN_PAD=") + std::to_string(static_cast(pad)) + + std::string(" -DMLO_LRN_PRE_PAD=") + std::to_string(static_cast(pre_pad)) + std::string(" -DMLO_LRN_N_HORIZ_OUT_PIX=") + std::to_string(static_cast(_out_pix_tile0)) + std::string(" -DMLO_LRN_N_VERT_OUT_PIX=") + diff --git a/test/lrn_test.cpp 
b/test/lrn_test.cpp index 4576edaa96..0d67d1414e 100644 --- a/test/lrn_test.cpp +++ b/test/lrn_test.cpp @@ -44,68 +44,68 @@ struct verify_lrn_foward miopen::LRNDescriptor lrn; tensor input; + verify_lrn_foward(const miopen::LRNDescriptor& plrnDesc, const tensor& pinput) + { + lrn = plrnDesc; + input = pinput; + } + tensor cpu() const { - auto output = input; + auto output = tensor{input.desc.GetLengths()}; int n_batch, channels, height, width; std::tie(n_batch, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); - auto alpha = lrn.GetAlpha(); - auto beta = lrn.GetBeta(); - auto K = lrn.GetK(); - auto lrn_n = lrn.GetN(); - auto radius = (lrn.GetN() - 1) / 2; - auto mode = lrn.GetMode(); + auto alpha = lrn.GetAlpha(); + auto beta = lrn.GetBeta(); + auto K = lrn.GetK(); + auto lrn_n = lrn.GetN(); + int radius_lower = (lrn_n - 1) / 2; + int radius_upper = lrn_n / 2; + auto mode = lrn.GetMode(); - CHECK((lrn_n & 1) == 1); if(mode == miopenLRNCrossChannel) { auto alphaoverarea = alpha / lrn_n; + par_ford(n_batch, channels, height, width)([&](int b, int c, int h, int w) { - par_ford(n_batch, height, width)([&](int b, int h, int w) { - double scale = 0; - ford(channels)([&](int c) { - auto start = (c - radius) < 0 ? 0 : (c - radius); - auto end = (c + radius) > channels ? channels : (c + radius); + int start = c < radius_lower ? 0 : (c - radius_lower); + int end = (c + radius_upper + 1) > channels ? 
channels : (c + radius_upper + 1); - for(auto k = start; k < end; k++) - { - scale += std::pow(input(b, k, h, w), 2); - } + double scale = 0; + for(int k = start; k < end; k++) + { + scale += std::pow(input(b, k, h, w), 2); + } - scale *= alphaoverarea; - scale += K; - scale = std::pow(scale, -beta); + scale *= alphaoverarea; + scale += K; + scale = std::pow(scale, -beta); - output(b, c, h, w) = input(b, c, h, w) * scale; - }); + output(b, c, h, w) = static_cast(scale * input(b, c, h, w)); }); } else { - - par_ford(n_batch, channels)([&](int b, int c) { + double alphaoverarea = radius_upper == 0 ? 1 : alpha / (lrn_n * lrn_n); + par_ford(n_batch, channels, height, width)([&](int b, int c, int h, int w) { double scale = 0; - ford(height, width)([&](int h, int w) { - auto left = (w - radius) < 0 ? 0 : (w - radius); - auto right = (w + radius) > width ? width : (w + radius); - auto top = (h - radius) < 0 ? 0 : (h - radius); - auto bottom = (h + radius) > height ? height : (h + radius); - auto alphaoverarea = - radius == 0 ? 0 : alpha / ((right - left) * (bottom - top)); - - for(auto i = left; i < right; i++) + int left = (w - radius_lower) < 0 ? 0 : (w - radius_lower); + int right = (w + radius_upper + 1) > width ? width : (w + radius_upper + 1); + int top = (h - radius_lower) < 0 ? 0 : (h - radius_lower); + int bottom = (h + radius_upper + 1) > height ? 
height : (h + radius_upper + 1); + + for(int i = left; i < right; i++) + { + for(int j = top; j < bottom; j++) { - for(auto j = top; j < bottom; j++) - { - scale += std::pow(input(b, c, h, w), 2); - } + scale += std::pow(input(b, c, j, i), 2); } - scale *= alphaoverarea; - scale += K; - scale = std::pow(scale, -beta); - output(b, c, h, w) = input(b, c, h, w) * scale; - }); + } + scale *= alphaoverarea; + scale += K; + scale = std::pow(scale, -beta); + output(b, c, h, w) = static_cast(scale * input(b, c, h, w)); }); } @@ -115,7 +115,7 @@ struct verify_lrn_foward tensor gpu() const { auto&& handle = get_handle(); - auto out = input; + auto out = tensor{input.desc.GetLengths()}; auto in_dev = handle.Write(input.data); auto out_dev = handle.Write(out.data); auto alpha = lrn.GetAlpha(); @@ -152,70 +152,79 @@ struct verify_lrn_bwd tensor inputY; tensor inputDY; tensor inputX; - tensor outputDX; tensor scale; + verify_lrn_bwd(const miopen::LRNDescriptor& plrn, + const tensor& pout, + const tensor& pdout, + const tensor& pin, + const tensor& pscale) + { + lrn = plrn; + inputY = pout; + inputDY = pdout; + inputX = pin; + scale = pscale; + } + tensor cpu() const { + auto routputDX = tensor{inputX.desc.GetLengths()}; int n_batch, channels, height, width; std::tie(n_batch, channels, height, width) = miopen::tien<4>(inputY.desc.GetLengths()); - auto routputDX = outputDX; - auto alpha = lrn.GetAlpha(); - auto beta = lrn.GetBeta(); - auto lrn_n = lrn.GetN(); - auto mode = lrn.GetMode(); - auto radius = (lrn_n - 1) / 2; + auto alpha = lrn.GetAlpha(); + auto beta = lrn.GetBeta(); + auto lrn_n = lrn.GetN(); + auto mode = lrn.GetMode(); + int radius_lower = (lrn_n - 1) / 2; + int radius_upper = lrn_n / 2; if(mode == miopenLRNWithinChannel) { - par_ford(n_batch, channels)([&](int b, int c) { - ford(height, width)([&](int h, int w) { - double ydy = 0; - auto left = (w - radius) < 0 ? 0 : (w - radius); - auto right = (left + lrn_n) > width ? 
width : (left + lrn_n); - auto top = (h - radius) < 0 ? 0 : (h - radius); - auto bottom = (top + lrn_n) > height ? height : (top + lrn_n); - auto adjust_area = (right - left) * (bottom - top); - auto cache_ratio_value = 2 * alpha * beta / adjust_area; - - for(auto i = left; i < right; i++) + auto adjust_area = lrn_n * lrn_n; + auto cache_ratio_value = 2 * alpha * beta / adjust_area; + + par_ford(n_batch, channels, height, width)([&](int b, int c, int h, int w) { + + int left = w < radius_upper ? 0 : (w - radius_upper); + int right = (w + radius_lower + 1) > width ? width : (w + radius_lower + 1); + int top = h < radius_upper ? 0 : (h - radius_upper); + int bottom = (h + radius_lower + 1) > height ? height : (h + radius_lower + 1); + + double ydy = 0; + for(int i = left; i < right; i++) + { + for(int j = top; j < bottom; j++) { - for(auto j = top; j < bottom; j++) - { - ydy += (double(inputY(b, c, j, i) * inputDY(b, c, j, i)) / - double(scale(b, c, j, i))); - } + ydy += (double(inputY(b, c, j, i) * inputDY(b, c, j, i)) / + double(scale(b, c, j, i))); } + } - routputDX(b, c, h, w) = - static_cast(std::pow(static_cast(scale(b, c, h, w)), -beta) * - inputDY(b, c, h, w) - - cache_ratio_value * inputX(b, c, h, w) * ydy); - }); + routputDX(b, c, h, w) = static_cast( + std::pow(static_cast(scale(b, c, h, w)), -beta) * inputDY(b, c, h, w) - + cache_ratio_value * inputX(b, c, h, w) * ydy); }); } else { auto cache_ratio_value = 2 * alpha * beta / lrn_n; - par_ford(n_batch, height, width)([&](int b, int h, int w) { - ford(channels)([&](int c) { - double ydy = 0; - auto start = (c - radius) < 0 ? 0 : (c - radius); - auto end = (c + radius) > channels ? channels : (c + radius); + par_ford(n_batch, channels, height, width)([&](int b, int c, int h, int w) { + int start = c < radius_upper ? 0 : (c - radius_upper); + int end = (c + radius_lower + 1) > channels ? 
channels : (c + radius_lower + 1); - for(auto k = start; k < end; k++) - { - ydy += (double(inputY(b, k, h, w) * inputDY(b, k, h, w)) / - double(scale(b, k, h, w))); - } + double ydy = 0; + for(auto k = start; k < end; k++) + { + ydy += (double(inputY(b, k, h, w) * inputDY(b, k, h, w)) / + double(scale(b, k, h, w))); + } - routputDX(b, c, h, w) = - static_cast(std::pow(static_cast(scale(b, c, h, w)), -beta) * - inputDY(b, c, h, w) - - cache_ratio_value * inputX(b, c, h, w) * ydy); - }); + routputDX(b, c, h, w) = static_cast( + std::pow(static_cast(scale(b, c, h, w)), -beta) * inputDY(b, c, h, w) - + cache_ratio_value * inputX(b, c, h, w) * ydy); }); } @@ -225,7 +234,7 @@ struct verify_lrn_bwd tensor gpu() const { auto&& handle = get_handle(); - auto routputDX = outputDX; + auto routputDX = tensor{inputX.desc.GetLengths()}; auto inputY_dev = handle.Write(inputY.data); auto inputDY_dev = handle.Write(inputDY.data); auto inputX_dev = handle.Write(inputX.data); @@ -267,10 +276,10 @@ struct lrn_driver : test_driver { tensor input; - unsigned int n = 0; - double alpha = 0; - double beta = 0; - double k = 0; + unsigned int n = 1; + double alpha = 1; + double beta = 1; + double k = 1; std::string mode; std::unordered_map mode_lookup = { @@ -278,14 +287,13 @@ struct lrn_driver : test_driver lrn_driver() { - disabled_cache = true; add(input, "input", get_input_tensor(tensor_elem_gen_integer{miopen_type{} == miopenHalf ? 
5 : 17})); - add(n, "N", generate_data({1, 3, 5})); - add(alpha, "alpha", generate_data({1.0})); - add(beta, "beta", generate_data({0})); - add(k, "K", generate_data({1})); + add(n, "N", generate_data({1, 4, 5})); + add(alpha, "alpha", generate_data({double(1)})); + add(beta, "beta", generate_data({double(1)})); + add(k, "K", generate_data({double(1)})); add(mode, "mode", generate_data({"Within_Channel", "Across_Channel"})); } @@ -306,20 +314,17 @@ struct lrn_driver : test_driver miopen::LRNDescriptor lrn{mode_lookup.at(miopen::ToUpper(mode)), n, {alpha, beta, k}}; - auto OutputDX = input; - auto fwd_output = verify(verify_lrn_foward{lrn, input}); - auto out = fwd_output.first; - + auto out = verify(verify_lrn_foward{lrn, input}); unsigned long max_value = miopen_type{} == miopenHalf ? 5 : 17; auto scale = tensor{n_batch, channels, height, width}.generate( tensor_elem_gen_integer{max_value}); - auto inputX = tensor{n_batch, channels, height, width}.generate( + auto dout = tensor{n_batch, channels, height, width}.generate( tensor_elem_gen_integer{max_value}); par_ford(n_batch, channels, height, width)( [&](int b, int c, int h, int w) { scale(b, c, h, w) += 1; }); - auto bwd_output = verify(verify_lrn_bwd{lrn, input, out, inputX, OutputDX, scale}); + verify(verify_lrn_bwd{lrn, out.first, dout, input, scale}); }; }; From 2f6bda070cb36ac83063a18c253a8de5096e2c7a Mon Sep 17 00:00:00 2001 From: zjing14 Date: Fri, 3 Jan 2020 23:29:08 -0600 Subject: [PATCH 05/15] fixed backward gemm workspace for group_conv (#2340) * fixed workspace for group_conv --- src/ocl/convolutionocl.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/ocl/convolutionocl.cpp b/src/ocl/convolutionocl.cpp index e5723c5c6a..e798a69493 100644 --- a/src/ocl/convolutionocl.cpp +++ b/src/ocl/convolutionocl.cpp @@ -3226,13 +3226,14 @@ void ConvolutionDescriptor::FindConvBwdDataAlgorithm(Handle& handle, time_gemm += in_n * time_col2im; if(gemm_status == 
miopenStatusSuccess) - record.SetValues("miopenConvolutionBwdDataAlgoGEMM", - FindDbData{ - "gemm", - time_gemm, - BackwardDataGetWorkSpaceSizeGEMM(wDesc, dyDesc), - kcache_key, - }); + record.SetValues( + "miopenConvolutionBwdDataAlgoGEMM", + FindDbData{ + "gemm", + time_gemm, + BackwardDataGetWorkSpaceSizeGEMM(wDesc, dyDesc) * group_count, + kcache_key, + }); } } #endif From c01b5d4e087ee2b21c8f8be5f547226619d8fcc6 Mon Sep 17 00:00:00 2001 From: JD Date: Sun, 5 Jan 2020 22:05:16 -0600 Subject: [PATCH 06/15] Correct perf db filename in documentation (#2343) * fix perf db filename and cmake prefix path note addition * reword perf db location --- README.md | 2 ++ doc/src/perfdatabase.md | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 120f6c62e9..1003f103eb 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,8 @@ An example cmake step can be: CXX=/opt/rocm/hcc/bin/hcc cmake -DMIOPEN_BACKEND=HIP -DCMAKE_PREFIX_PATH="/opt/rocm/hcc;/opt/rocm/hip" .. ``` +Note: When specifying the path for the `CMAKE_PREFIX_PATH` variable, do not use the `~` shorthand for the user home directory. + ### Setting Up Locations By default the install location is set to '/opt/rocm', this can be set by using `CMAKE_INSTALL_PREFIX`: diff --git a/doc/src/perfdatabase.md b/doc/src/perfdatabase.md index 98dfe88e40..bd7922e568 100644 --- a/doc/src/perfdatabase.md +++ b/doc/src/perfdatabase.md @@ -78,4 +78,4 @@ This variable allows for limiting the scope of `MIOPEN_FIND_ENFORCE`, so that on ### Updating MIOpen and the User Db -It is important to note that if the user installs a new version of MIOpen, it is recommended that the user move, or delete their old user performance database file. This will prevent older database entries from polution the configurations shipped with the newer system database. The user can find the file with the suffix `*.updb.txt` in the user perf db path. 
\ No newline at end of file +It is important to note that if the user installs a new version of MIOpen, it is recommended that the user move, or delete their old user performance database file. This will prevent older database entries from poluting the configurations shipped with the newer system database. The user perf db is named `miopen.udb` and is located at the user perf db path. From 3757315604923d9c108d372cd424b68098b5f871 Mon Sep 17 00:00:00 2001 From: streamhsa Date: Thu, 9 Jan 2020 01:33:34 +0530 Subject: [PATCH 07/15] MLOpen changes to support multiple ROCM installation (#2344) - To support custom path installation, CMAKE_INSTALL_PREFIX is set to the required path. - Lib SO Version is set to 1.0 through rocm_set_soversion - Updated to current HEAD commit for ROCM-CMAKE Signed-off-by: Pruthvi Madugundu --- CMakeLists.txt | 2 +- cmake/FindOpenCL.cmake | 2 ++ requirements.txt | 2 +- src/CMakeLists.txt | 5 +++-- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 41abb4a0da..d9ea96dd08 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -207,7 +207,7 @@ endif() # Online assembler find_program(MIOPEN_AMDGCN_ASSEMBLER NAMES clang - PATHS ${MIOPEN_AMDGCN_ASSEMBLER_PATH} /opt/rocm + PATHS ${MIOPEN_AMDGCN_ASSEMBLER_PATH} ${CMAKE_INSTALL_PREFIX} PATH_SUFFIXES /opencl/bin/x86_64 NO_DEFAULT_PATH ) diff --git a/cmake/FindOpenCL.cmake b/cmake/FindOpenCL.cmake index 5109b39af8..07a986226c 100644 --- a/cmake/FindOpenCL.cmake +++ b/cmake/FindOpenCL.cmake @@ -35,6 +35,7 @@ find_path(OPENCL_INCLUDE_DIRS /usr/local/cuda/include /opt/cuda/include /opt/rocm/opencl/include + ${CMAKE_INSTALL_PREFIX}/opencl/include DOC "OpenCL header file path" ) mark_as_advanced( OPENCL_INCLUDE_DIRS ) @@ -53,6 +54,7 @@ if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") /usr/local/cuda/lib /opt/cuda/lib /opt/rocm/opencl/lib + ${CMAKE_INSTALL_PREFIX}/opencl/lib ) else( ) find_library( OPENCL_LIBRARIES diff --git a/requirements.txt b/requirements.txt index 
3e8570f237..fbb2953551 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -RadeonOpenCompute/rocm-cmake@3f43e2d493f24abbab4dc189a9ab12cc3ad33baf --build +RadeonOpenCompute/rocm-cmake@1abe21258481d4cf92f5bab0ef5956636c52f735 --build RadeonOpenCompute/clang-ocl@363b4f7ad8eb7b5104b9d5a3b8bf93f294d3ffae ROCmSoftwarePlatform/MIOpenGEMM@0eb1257cfaef83ea155aabd67af4437c0028db48 ROCmSoftwarePlatform/rocBLAS@cbf0dd9a26b4406300d98dbc85088568c1532faf diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 620801621d..7809975d93 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -46,7 +46,8 @@ message( STATUS "MIOpen_VERSION= ${MIOpen_VERSION}" ) message( STATUS "CMAKE_BUILD_TYPE= ${CMAKE_BUILD_TYPE}" ) # This is incremented when the ABI to the library changes -set( MIOpen_SOVERSION 1 ) +set( MIOpen_SOVERSION 1.0 ) + function(add_kernels KERNEL_FILES) set(INIT_KERNELS_LIST) @@ -448,7 +449,7 @@ add_library( MIOpen ${MIOpen_Source} ) -set_target_properties(MIOpen PROPERTIES SOVERSION 1) +rocm_set_soversion(MIOpen ${MIOpen_SOVERSION}) clang_tidy_check(MIOpen) From f18dc53826008ace6bcaa12b4ff07746fca90146 Mon Sep 17 00:00:00 2001 From: zjing14 Date: Fri, 10 Jan 2020 22:27:35 -0600 Subject: [PATCH 08/15] diff of PR 2322 (#2350) --- .../include/utility/amd_inline_asm.hpp | 4 ++++ src/solver/conv_hip_implicit_gemm_v4.cpp | 1 + src/solver/conv_hip_implicit_gemm_v4r1.cpp | 2 ++ src/solver/implicitgemm_util.hpp | 13 ++++++------- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/kernels/composable_kernel/include/utility/amd_inline_asm.hpp b/src/kernels/composable_kernel/include/utility/amd_inline_asm.hpp index 51ebfb9065..29aca7818f 100644 --- a/src/kernels/composable_kernel/include/utility/amd_inline_asm.hpp +++ b/src/kernels/composable_kernel/include/utility/amd_inline_asm.hpp @@ -5,6 +5,7 @@ namespace ck { +#if MIOPEN_USE_FP32 // outer-product: c[i,j] += inner_product(a[i], b[j]) __device__ void amd_assembly_outer_product_1x2(float a, 
float b0, float b1, float& c0, float& c1) { @@ -29,7 +30,9 @@ __device__ void amd_assembly_outer_product_1x4( : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3) : "v"(a), "v"(b0), "v"(b1), "v"(b2), "v"(b3), "0"(c0), "1"(c1), "2"(c2), "3"(c3)); } +#endif +#if MIOPEN_USE_FP16 // outer-product: c[i,j] += inner_product(a[i], b[j]) __device__ void amd_assembly_outer_product_1x2(half2_t a, half2_t b0, half2_t b1, float& c0, float& c1) @@ -145,6 +148,7 @@ __device__ void amd_assembly_outer_product_1x4(half4_t a, "2"(c2), "3"(c3)); // 3rd Src Acc registers for 2 half2 registers } +#endif } // namespace ck #endif diff --git a/src/solver/conv_hip_implicit_gemm_v4.cpp b/src/solver/conv_hip_implicit_gemm_v4.cpp index 07bcd93795..1e57c1ed66 100644 --- a/src/solver/conv_hip_implicit_gemm_v4.cpp +++ b/src/solver/conv_hip_implicit_gemm_v4.cpp @@ -685,6 +685,7 @@ static inline ConvSolution GetSolutionBase(const ConvolutionContext& ctx, std::string(" -DCK_PARAM_WEI_BLOCK_COPY_DST_DATA_PER_WRITE_K=") + std::to_string(WeiBlockCopyDstDataPerWrite_K) + std::string(" -DCK_PARAM_EPACK_LENGTH=") + std::to_string(GetEPackLength(ctx, false)) + std::string(" -DCK_THREADWISE_GEMM_USE_AMD_INLINE_ASM=") + (use_amd_inline_asm(ctx) ? '1' : '0') + + std::string(" -DCK_USE_AMD_INLINE_ASM=") + (use_amd_inline_asm(ctx) ? '1' : '0') + std::string(" -D__HIP_PLATFORM_HCC__=1") + ctx.general_compile_options; // clang-format on diff --git a/src/solver/conv_hip_implicit_gemm_v4r1.cpp b/src/solver/conv_hip_implicit_gemm_v4r1.cpp index 13f8f991d6..b6d60b036c 100644 --- a/src/solver/conv_hip_implicit_gemm_v4r1.cpp +++ b/src/solver/conv_hip_implicit_gemm_v4r1.cpp @@ -203,6 +203,7 @@ ConvSolution ConvHipImplicitGemmV4R1Fwd::GetSolution(const ConvolutionContext& c std::string(" -DCK_PARAM_WEI_BLOCK_COPY_SRC_DATA_PER_READ_E=") + std::to_string(4) + std::string(" -DCK_PARAM_WEI_BLOCK_COPY_DST_DATA_PER_WRITE_K=") + std::to_string(1) + std::string(" -DCK_THREADWISE_GEMM_USE_AMD_INLINE_ASM=") + (use_amd_inline_asm(ctx) ? 
'1' : '0') + + std::string(" -DCK_USE_AMD_INLINE_ASM=") + (use_amd_inline_asm(ctx) ? '1' : '0') + std::string(" -D__HIP_PLATFORM_HCC__=1") + ctx.general_compile_options; // clang-format on @@ -327,6 +328,7 @@ ConvSolution ConvHipImplicitGemmV4R1WrW::GetSolution(const ConvolutionContext& c std::string(" -DCK_PARAM_WEI_BLOCK_COPY_SRC_DATA_PER_READ_E=") + std::to_string(1) + std::string(" -DCK_PARAM_WEI_BLOCK_COPY_DST_DATA_PER_WRITE_K=") + std::to_string(4) + std::string(" -DCK_THREADWISE_GEMM_USE_AMD_INLINE_ASM=") + (use_amd_inline_asm(ctx) ? '1' : '0') + + std::string(" -DCK_USE_AMD_INLINE_ASM=") + (use_amd_inline_asm(ctx) ? '1' : '0') + std::string(" -D__HIP_PLATFORM_HCC__=1") + ctx.general_compile_options; // clang-format on diff --git a/src/solver/implicitgemm_util.hpp b/src/solver/implicitgemm_util.hpp index 02b02b445c..ee171e17e7 100644 --- a/src/solver/implicitgemm_util.hpp +++ b/src/solver/implicitgemm_util.hpp @@ -155,17 +155,16 @@ static inline int RunAndMeasureSolutionBase(miopen::Handle& profile_h, static inline bool use_amd_inline_asm(const ConvolutionContext& ctx) { - bool amd_inline_asm = !miopen::IsDisabled(MIOPEN_DEBUG_IMPLICIT_GEMM_NON_XDLOPS_INLINE_ASM{}); if(StartsWith(ctx.GetStream().GetDeviceName(), "gfx8")) - amd_inline_asm = false; + return false; - if(!(StartsWith(ctx.GetStream().GetDeviceName(), "gfx906") || - StartsWith(ctx.GetStream().GetDeviceName(), "gfx908")) && - ctx.IsFp16()) - amd_inline_asm = false; + // disable fp16 inline asm for <= gfx900 + const auto device_name = ctx.GetStream().GetDeviceName(); + if(!(StartsWith(device_name, "gfx906") || StartsWith(device_name, "gfx908")) && ctx.IsFp16()) + return false; - return amd_inline_asm; + return !miopen::IsDisabled(MIOPEN_DEBUG_IMPLICIT_GEMM_NON_XDLOPS_INLINE_ASM{}); } } // namespace solver From 545964f2c1296e57cfd45d714577181a311718a3 Mon Sep 17 00:00:00 2001 From: Daniel Lowell Date: Mon, 13 Jan 2020 15:27:44 -0600 Subject: [PATCH 09/15] Bump point version --- CMakeLists.txt | 2 +- 
1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d9ea96dd08..a91cc64c30 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,7 +67,7 @@ if(NOT WIN32 AND NOT APPLE) set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -s") endif() -rocm_setup_version(VERSION 2.2.0) +rocm_setup_version(VERSION 2.2.1) list( APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake ) include(TargetFlags) From f583da21d51e32b8c58d88d70ffbf42ebbdcc993 Mon Sep 17 00:00:00 2001 From: Artem Tamazov Date: Wed, 11 Dec 2019 02:07:12 +0300 Subject: [PATCH 10/15] CO v3 support in BN Backward asm kernel. Fixes in both BN asm kernels. (#2299) * cov3-bn-asm(01) Host code: enable CO v3 and define symbol. * cov3-bn-asm(02) Host code: fix build errors. * cov3-bn-asm(03) Refactor BN Fwd kernel. * cov3-bn-asm(04) BN Fwd kernel: errors when mandatory symbol is not defined. Remove useless MD lines. * cov3-bn-asm(05) General refactor: Split common.inc to conv_common and utilities. * cov3-bn-asm(06) BN Fwd kernel: remove redundant macro parameters. * cov3-bn-asm(07) BN Fwd kernel: fix kernarg size. * cov3-bn-asm(08) BN Fwd: unify and refactor. * cov3-bn-asm(09) BN Bwd: Reserve VCC, fix kernarg size, fix MaxFlatWorkGroupSize, fix GPR counts, unify and refactor. * cov3-bn-asm(10) BN Fwd and BN Bwd: Fix kernarg alignment in the MD. * cov3-bn-asm(11) BN Bwd: Fix LDS size in the MD v2. BN Fwd: Refactor LDS size. * cov3-bn-asm(12) BN Bwd: Fix LDS size. * cov3-bn-asm(13) BN Bwd: Added CO v3 KD and MD. * cov3-bn-asm(14) Host code: revert workaround that enables BN asm kernels. * cov3-bn-asm(16) Host code: disable CO v3 for BN Forward asm kernel. 
--- src/CMakeLists.txt | 3 +- src/kernels/conv1x1u.s | 3 +- src/kernels/conv1x1u_bias_activ.s | 3 +- src/kernels/conv1x1u_stride2.s | 3 +- src/kernels/conv1x1wrw.s | 3 +- src/kernels/conv3x3.s | 3 +- src/kernels/conv3x3wrw.s | 3 +- src/kernels/{common.inc => conv_common.inc} | 20 --- src/kernels/gcnAsmBNBwdTrainSpatial.s | 175 ++++++++++++++------ src/kernels/gcnAsmBNFwdTrainSpatial.s | 138 +++++++-------- src/kernels/utilities.inc | 45 +++++ src/kernels/xform_data_filter.inc | 3 +- src/kernels/xform_out.s | 3 +- src/ocl/batchnormocl.cpp | 11 +- 14 files changed, 262 insertions(+), 154 deletions(-) rename src/kernels/{common.inc => conv_common.inc} (97%) create mode 100644 src/kernels/utilities.inc diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7809975d93..c074206c91 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -239,7 +239,8 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/conv_3x3_wheel_alpha_v9_0_15.inc kernels/rocm_version.inc kernels/inst_wrappers.inc - kernels/common.inc + kernels/conv_common.inc + kernels/utilities.inc kernels/xform_data_filter.inc kernels/xform_kd_cov2.inc kernels/xform_metadata.inc diff --git a/src/kernels/conv1x1u.s b/src/kernels/conv1x1u.s index 6cee6bd886..f63490d38d 100644 --- a/src/kernels/conv1x1u.s +++ b/src/kernels/conv1x1u.s @@ -39,7 +39,8 @@ .include "rocm_version.inc" .include "gpr_alloc.inc" -.include "common.inc" +.include "utilities.inc" +.include "conv_common.inc" .include "inst_wrappers.inc" // initial state: diff --git a/src/kernels/conv1x1u_bias_activ.s b/src/kernels/conv1x1u_bias_activ.s index a5d148f505..dd36353be3 100644 --- a/src/kernels/conv1x1u_bias_activ.s +++ b/src/kernels/conv1x1u_bias_activ.s @@ -39,7 +39,8 @@ .include "rocm_version.inc" .include "gpr_alloc.inc" -.include "common.inc" +.include "utilities.inc" +.include "conv_common.inc" .include "inst_wrappers.inc" .include "neuron.inc" diff --git a/src/kernels/conv1x1u_stride2.s 
b/src/kernels/conv1x1u_stride2.s index fd833579c5..c98c165182 100644 --- a/src/kernels/conv1x1u_stride2.s +++ b/src/kernels/conv1x1u_stride2.s @@ -40,7 +40,8 @@ .include "rocm_version.inc" .include "gpr_alloc.inc" -.include "common.inc" +.include "utilities.inc" +.include "conv_common.inc" .include "inst_wrappers.inc" // initial state: diff --git a/src/kernels/conv1x1wrw.s b/src/kernels/conv1x1wrw.s index 0fd415c052..71f5622684 100644 --- a/src/kernels/conv1x1wrw.s +++ b/src/kernels/conv1x1wrw.s @@ -39,7 +39,8 @@ .include "rocm_version.inc" .include "gpr_alloc.inc" -.include "common.inc" +.include "utilities.inc" +.include "conv_common.inc" .include "inst_wrappers.inc" // initial state (s[0:4] are overlapped with filtersA): diff --git a/src/kernels/conv3x3.s b/src/kernels/conv3x3.s index 12fbae6059..8df204b3b1 100644 --- a/src/kernels/conv3x3.s +++ b/src/kernels/conv3x3.s @@ -40,7 +40,8 @@ .include "rocm_version.inc" .include "gpr_alloc.inc" -.include "common.inc" +.include "utilities.inc" +.include "conv_common.inc" .include "inst_wrappers.inc" // initial state (s[0:4] are overlapped with filtersA): diff --git a/src/kernels/conv3x3wrw.s b/src/kernels/conv3x3wrw.s index ff80c0a26c..f0f965f82b 100755 --- a/src/kernels/conv3x3wrw.s +++ b/src/kernels/conv3x3wrw.s @@ -65,7 +65,8 @@ gid_z = 4 .set unused_ptr_off, 0x38 .set KERNEL_ARGUMENTS_SIZE, unused_ptr_off + 8 -.include "common.inc" +.include "utilities.inc" +.include "conv_common.inc" default c_per_wave, 4 default k_per_wave, 4 diff --git a/src/kernels/common.inc b/src/kernels/conv_common.inc similarity index 97% rename from src/kernels/common.inc rename to src/kernels/conv_common.inc index 4270008852..89460e1430 100644 --- a/src/kernels/common.inc +++ b/src/kernels/conv_common.inc @@ -51,26 +51,6 @@ TYPE_INT4 = 8 .endr .endm -.macro default symbol, value - .ifnotdef \symbol - \symbol = \value - .endif -.endm - -.macro error_ifnotdef symbol - .ifnotdef \symbol - .error "\symbol is not defined." 
- .end - .endif -.endm - -.macro static_assert fufufu - .if !\fufufu - .error "\fufufu is false" - .end - .endif -.endm - .macro swap a, b __tmp = \a \a = \b diff --git a/src/kernels/gcnAsmBNBwdTrainSpatial.s b/src/kernels/gcnAsmBNBwdTrainSpatial.s index 79fdc76434..a14cd58e1f 100644 --- a/src/kernels/gcnAsmBNBwdTrainSpatial.s +++ b/src/kernels/gcnAsmBNBwdTrainSpatial.s @@ -23,16 +23,10 @@ * SOFTWARE. * *******************************************************************************/ - .hsa_code_object_version 2,1 - - .hsa_code_object_isa - - .text - .amdgpu_hsa_kernel miopenGcnAsmBNBwdTrainSpatial - -.include "gpr_alloc.inc" -.include "common.inc" +.include "rocm_version.inc" .include "inst_wrappers.inc" +.include "utilities.inc" +.include "gpr_alloc.inc" // kernarg layout: kernarg = 4 @@ -43,13 +37,24 @@ in_desc = 0 .set bnScale_ptr_off, 0x18 .set dscale_ptr_off, 0x20 .set dbias_ptr_off, 0x28 +// +// Variadic list of arguments. +// +error_ifnotdef MIO_BN_USESAVED +static_assert(MIO_BN_USESAVED == 0 || MIO_BN_USESAVED == 1) .if (MIO_BN_USESAVED == 0) - .set epsilon_off, 0x30 - .set inhw_off, 0x38 + .set epsilon_off, 0x30 // size 8 align 8 + .set inhw_off, 0x38 // size 4 align 4 + .set KERNARG_SIZE, 4 + inhw_off .elseif (MIO_BN_USESAVED == 1) - .set SavedMean_off, 0x30 - .set SavedInvVariance_off, 0x38 - .set inhw_off, 0x40 + .set SavedMean_off, 0x30 // size 8 align 8 + .set SavedInvVariance_off, 0x38 // size 8 align 8 + .set inhw_off, 0x40 // size 4 align 4 + .set KERNARG_SIZE, 4 + inhw_off +.endif +error_ifnotdef KERNARG_SIZE +.if KERNARG_SIZE % 8 != 0 // Kernarg alignment is 8. 
+ .set KERNARG_SIZE, ((KERNARG_SIZE / 8) + 1) * 8 .endif .set bn_bwd_lds_mask, 0x1C @@ -65,7 +70,6 @@ fmamix_instructions_available = 0 .endif .GPR_ALLOC_BEGIN - //.SGPR_ALLOC_FROM 4 .SGPR_ALLOC_FROM 0 .SGPR_ALLOC stmp,8 @@ -79,6 +83,7 @@ fmamix_instructions_available = 0 .SGPR_ALLOC stmp9 //20 .SGPR_ALLOC stmp10 //21 .SGPR_RESERVE_XNACK + .SGPR_RESERVE_VCC .VGPR_ALLOC_FROM 0 .VGPR_ALLOC tid @@ -90,13 +95,27 @@ fmamix_instructions_available = 0 .VGPR_ALLOC qtmp4, 4 //13-v16 .VGPR_ALLOC qtmp5, 4 //v17-v20 - //.LDS_ALLOC_FROM 0 - //.LDS_ALLOC accums_lds, 10 - + .LDS_ALLOC_FROM 0 + .LDS_ALLOC UNUSED_accums_lds, 212 .GPR_ALLOC_END -miopenGcnAsmBNBwdTrainSpatial: +.if ROCM_METADATA_VERSION == 4 +.hsa_code_object_version 2,1 +.hsa_code_object_isa +.endif +.text +.globl miopenGcnAsmBNBwdTrainSpatial +.type miopenGcnAsmBNBwdTrainSpatial,@function +.p2align 8 + +.if ROCM_METADATA_VERSION == 4 +.amdgpu_hsa_kernel miopenGcnAsmBNBwdTrainSpatial +.endif + + +miopenGcnAsmBNBwdTrainSpatial: +.if ROCM_METADATA_VERSION == 4 .amd_kernel_code_t kernel_code_entry_byte_offset = 256 granulated_workitem_vgpr_count = .AUTO_VGPR_GRANULATED_COUNT @@ -109,15 +128,15 @@ miopenGcnAsmBNBwdTrainSpatial: enable_sgpr_kernarg_segment_ptr = 1 private_element_size = 1 is_ptr64 = 1 - workgroup_group_segment_byte_size = 44 - kernarg_segment_byte_size = 120 + workgroup_group_segment_byte_size = .AUTO_LDS_BYTE_SIZE + kernarg_segment_byte_size = KERNARG_SIZE wavefront_sgpr_count = .AUTO_SGPR_COUNT workitem_vgpr_count = .AUTO_VGPR_COUNT - kernarg_segment_alignment = 4 + kernarg_segment_alignment = 8 group_segment_alignment = 4 private_segment_alignment = 4 .end_amd_kernel_code_t - +.endif // s[kernarg:kernarg+1] - kernel arg base address... // V0 - work item id... 
// s8: group ID @@ -380,23 +399,83 @@ skip_normalization: flat_store_dword v[qtmp3:qtmp3+1], v[v_db] s_endpgm +.Lfunc_end0: + .size miopenGcnAsmBNBwdTrainSpatial, .Lfunc_end0 - miopenGcnAsmBNBwdTrainSpatial +static_assert(MIO_BN_GRP1 == 1 && MIO_BN_GRP2 == 1) // Required workgroup size and max flat workgroup size depend on this +.if ROCM_METADATA_VERSION == 5 -.Lfunc_end0: - .size miopenGcnAsmBNBwdTrainSpatial, .Lfunc_end0 - miopenGcnAsmBNBwdTrainSpatial +.rodata +.p2align 6 +.amdhsa_kernel miopenGcnAsmBNBwdTrainSpatial + .amdhsa_dx10_clamp 0 + .amdhsa_ieee_mode 0 + .amdhsa_float_round_mode_32 0 + .amdhsa_float_round_mode_16_64 0 + .amdhsa_float_denorm_mode_32 0 + .amdhsa_float_denorm_mode_16_64 3 + .amdhsa_user_sgpr_private_segment_buffer 1 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_group_segment_fixed_size .AUTO_LDS_BYTE_SIZE + .amdhsa_private_segment_fixed_size 132 + .amdhsa_next_free_sgpr __amdhsa_next_free_sgpr + .amdhsa_next_free_vgpr .AUTO_VGPR_COUNT + .amdhsa_reserve_flat_scratch __sgpr_reserve_flatscr + .amdhsa_reserve_xnack_mask __sgpr_reserve_xnack + .amdhsa_reserve_vcc __sgpr_reserve_vcc +.end_amdhsa_kernel + +.macro METADATA sc, vc, wg_x, lds_size, kernarg_size +.if (MIO_BN_USESAVED == 0) + .error "CO v3 is not supported when (MIO_BN_USESAVED == 0)" + .end +.elseif (MIO_BN_USESAVED == 1) +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: miopenGcnAsmBNBwdTrainSpatial + .symbol: miopenGcnAsmBNBwdTrainSpatial.kd + .sgpr_count: \sc + .vgpr_count: \vc + .language: "OpenCL C" + .language_version: [ 1, 2 ] + .kernarg_segment_size: \kernarg_size + .kernarg_segment_align: 8 + .group_segment_fixed_size: \lds_size + .private_segment_fixed_size: 132 + .reqd_workgroup_size: [ \wg_x, 1, 1 ] + .max_flat_workgroup_size: \wg_x + .wavefront_size: 64 + .args: + - { .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f16, .name: x_in, .address_space: global, .is_const: true } + - { 
.size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f16, .name: dy_in, .address_space: global, .is_const: false } + - { .size: 8, .offset: 16, .value_kind: global_buffer, .value_type: f16, .name: dx_out, .address_space: global, .is_const: false } + - { .size: 8, .offset: 24, .value_kind: global_buffer, .value_type: f32, .name: bnScale, .address_space: global, .is_const: true } + - { .size: 8, .offset: 32, .value_kind: global_buffer, .value_type: f32, .name: dscale, .address_space: global, .is_const: false } + - { .size: 8, .offset: 40, .value_kind: global_buffer, .value_type: f32, .name: dbias, .address_space: global, .is_const: false } + - { .size: 8, .offset: 48, .value_kind: global_buffer, .value_type: f32, .name: savedMean, .address_space: global, .is_const: true } + - { .size: 8, .offset: 56, .value_kind: global_buffer, .value_type: f32, .name: savedInvVariance, .address_space: global, .is_const: true } + - { .size: 4, .offset: 64, .value_kind: by_value, .value_type: f32, .name: INHW } +... 
+.end_amdgpu_metadata +.endif +.endm // METADATA + +.elseif ROCM_METADATA_VERSION == 4 -.macro metadata wg_x, use_save_flag - .if ROCM_METADATA_VERSION == 4 - .if (\use_save_flag == 0) - .amd_amdgpu_hsa_metadata - { Version: [ 1, 0 ], +.macro METADATA sc, vc, wg_x, lds_size, kernarg_size + .if (MIO_BN_USESAVED == 0) + .amd_amdgpu_hsa_metadata + { Version: [ 1, 0 ], Kernels: - { Name: miopenGcnAsmBNBwdTrainSpatial, SymbolName: 'miopenGcnAsmBNBwdTrainSpatial@kd', Language: OpenCL C, LanguageVersion: [ 1, 2 ], Attrs: { ReqdWorkGroupSize: [ \wg_x, 1, 1 ] } CodeProps: - { KernargSegmentSize: 112, GroupSegmentFixedSize: 212, PrivateSegmentFixedSize: 132, KernargSegmentAlign: 8, WavefrontSize: 64, NumSGPRs: 32, NumVGPRs: 20, MaxFlatWorkGroupSize: 832} + { KernargSegmentSize: \kernarg_size, GroupSegmentFixedSize: \lds_size, PrivateSegmentFixedSize: 132, KernargSegmentAlign: 8, WavefrontSize: 64, NumSGPRs: \sc, NumVGPRs: \vc, MaxFlatWorkGroupSize: \wg_x} Args: - { Name: x_in , Size: 8, Align: 8, ValueKind: GlobalBuffer, ValueType: F16, TypeName: 'half*', AddrSpaceQual: Global, AccQual: ReadOnly, IsConst: true, IsRestrict: true} - { Name: dy_in , Size: 8, Align: 8, ValueKind: GlobalBuffer, ValueType: F16, TypeName: 'half*', AddrSpaceQual: Global, AccQual: Default, IsRestrict: true} @@ -406,21 +485,18 @@ skip_normalization: - { Name: dbias , Size: 8, Align: 8, ValueKind: GlobalBuffer, ValueType: F32, TypeName: 'float*', AddrSpaceQual: Global, AccQual: Default, IsRestrict: true } - { Name: epsilon, Size: 8, Align: 8, ValueKind: ByValue, ValueType: F64, TypeName: 'double', AccQual: Default } - { Name: INHW , Size: 4, Align: 4, ValueKind: ByValue, ValueType: F32, TypeName: 'float', AccQual: Default } - //- { Name: HiddenGlobalOffsetX, Size: 8, Align: 8, ValueKind: ByValue, ValueType: I64 } - //- { Name: HiddenGlobalOffsetY, Size: 8, Align: 8, ValueKind: ByValue, ValueType: I64 } - //- { Name: HiddenGlobalOffsetZ, Size: 8, Align: 8, ValueKind: ByValue, ValueType: I64 } } - } - 
.end_amd_amdgpu_hsa_metadata - .elseif (\use_save_flag == 1) - .amd_amdgpu_hsa_metadata - { Version: [ 1, 0 ], + } + .end_amd_amdgpu_hsa_metadata + .elseif (MIO_BN_USESAVED == 1) + .amd_amdgpu_hsa_metadata + { Version: [ 1, 0 ], Kernels: - { Name: miopenGcnAsmBNBwdTrainSpatial, SymbolName: 'miopenGcnAsmBNBwdTrainSpatial@kd', Language: OpenCL C, LanguageVersion: [ 1, 2 ], Attrs: { ReqdWorkGroupSize: [ \wg_x, 1, 1 ] } CodeProps: - { KernargSegmentSize: 112, GroupSegmentFixedSize: 212, PrivateSegmentFixedSize: 132, KernargSegmentAlign: 8, WavefrontSize: 64, NumSGPRs: 32, NumVGPRs: 20, MaxFlatWorkGroupSize: 832} + { KernargSegmentSize: \kernarg_size, GroupSegmentFixedSize: \lds_size, PrivateSegmentFixedSize: 132, KernargSegmentAlign: 8, WavefrontSize: 64, NumSGPRs: \sc, NumVGPRs: \vc, MaxFlatWorkGroupSize: \wg_x} Args: - { Name: x_in , Size: 8, Align: 8, ValueKind: GlobalBuffer, ValueType: F16, TypeName: 'half*', AddrSpaceQual: Global, AccQual: ReadOnly, IsConst: true, IsRestrict: true} - { Name: dy_in , Size: 8, Align: 8, ValueKind: GlobalBuffer, ValueType: F16, TypeName: 'half*', AddrSpaceQual: Global, AccQual: Default, IsRestrict: true} @@ -431,22 +507,13 @@ skip_normalization: - { Name: savedMean , Size: 8, Align: 8, ValueKind: GlobalBuffer, ValueType: F32, TypeName: 'float*', AddrSpaceQual: Global, AccQual: Default, IsConst: true } - { Name: savedInvVariance , Size: 8, Align: 8, ValueKind: GlobalBuffer, ValueType: F32, TypeName: 'float*', AddrSpaceQual: Global, AccQual: Default, IsConst: true } - { Name: INHW , Size: 4, Align: 4, ValueKind: ByValue, ValueType: F32, TypeName: 'float', AccQual: Default } - //- { Name: HiddenGlobalOffsetX, Size: 8, Align: 8, ValueKind: ByValue, ValueType: I64 } - //- { Name: HiddenGlobalOffsetY, Size: 8, Align: 8, ValueKind: ByValue, ValueType: I64 } - //- { Name: HiddenGlobalOffsetZ, Size: 8, Align: 8, ValueKind: ByValue, ValueType: I64 } } - } - .end_amd_amdgpu_hsa_metadata + } + .end_amd_amdgpu_hsa_metadata .endif - .endif // 
ROCM_METADATA_VERSION == 4 -.endm +.endm // METADATA +.endif // ROCM_METADATA_VERSION .altmacro -.macro metadata_wrapper x, y - metadata %\x, %\y -.endm - -metadata_wrapper MIO_BN_GRP0, MIO_BN_USESAVED - - +METADATA %.AUTO_SGPR_COUNT, %.AUTO_VGPR_COUNT, %MIO_BN_GRP0, %.AUTO_LDS_BYTE_SIZE, %KERNARG_SIZE diff --git a/src/kernels/gcnAsmBNFwdTrainSpatial.s b/src/kernels/gcnAsmBNFwdTrainSpatial.s index 54a18f5784..883ccf0fa7 100644 --- a/src/kernels/gcnAsmBNFwdTrainSpatial.s +++ b/src/kernels/gcnAsmBNFwdTrainSpatial.s @@ -23,23 +23,9 @@ * SOFTWARE. * *******************************************************************************/ -.hsa_code_object_version 2,1 - -.hsa_code_object_isa - -.text -.amdgpu_hsa_kernel miopenGcnAsmBNFwdTrainSpatial - - -/// \todo Better use common.inc. This requires more testing, so let's just copy macro here. -.macro static_assert fufufu - .if !\fufufu - .error "\fufufu is false" - .end - .endif -.endm - +.include "rocm_version.inc" .include "inst_wrappers.inc" +.include "utilities.inc" // kernarg layout: kernarg = 4 @@ -49,6 +35,14 @@ in_desc = 0 .set scale_ptr_off, 0x10 .set bias_ptr_off, 0x18 .set inhw_off, 0x20 +// +// Variadic list of arguments. +// All the arguments listed below are of size 8 and align 8. 
+// +error_ifnotdef MIO_SAVE_MEAN_VARIANCE +error_ifnotdef MIO_RUNNING_RESULT +static_assert(MIO_SAVE_MEAN_VARIANCE == 0 || MIO_SAVE_MEAN_VARIANCE == 1) +static_assert(MIO_RUNNING_RESULT == 0 || MIO_RUNNING_RESULT == 1) .if (MIO_SAVE_MEAN_VARIANCE == 1) && (MIO_RUNNING_RESULT == 1) .set expAvgFactor_off, 0x28 .set resultRunningMean_off, 0x30 @@ -56,27 +50,25 @@ in_desc = 0 .set epsilon_off, 0x40 .set resultSaveMean_off, 0x48 .set resultSaveInvVariance_off, 0x50 + .set KERNARG_SIZE, 8 + resultSaveInvVariance_off .elseif (MIO_SAVE_MEAN_VARIANCE == 0) && (MIO_RUNNING_RESULT == 1) .set expAvgFactor_off, 0x28 .set resultRunningMean_off, 0x30 .set resultRunningVariance_off, 0x38 .set epsilon_off, 0x40 - .set resultSaveMean_off, 0x0 - .set resultSaveInvVariance_off, 0x0 + .set KERNARG_SIZE, 8 + epsilon_off .elseif (MIO_SAVE_MEAN_VARIANCE == 1) && (MIO_RUNNING_RESULT == 0) - .set expAvgFactor_off, 0x0 - .set resultRunningMean_off, 0x0 - .set resultRunningVariance_off, 0x0 .set epsilon_off, 0x28 .set resultSaveMean_off, 0x30 .set resultSaveInvVariance_off, 0x38 + .set KERNARG_SIZE, 8 + resultSaveInvVariance_off .elseif (MIO_SAVE_MEAN_VARIANCE == 0) && (MIO_RUNNING_RESULT == 0) - .set expAvgFactor_off, 0x0 - .set resultRunningMean_off, 0x0 - .set resultRunningVariance_off, 0x0 .set epsilon_off, 0x28 - .set resultSaveMean_off, 0x0 - .set resultSaveInvVariance_off, 0x0 + .set KERNARG_SIZE, 8 + epsilon_off +.endif +error_ifnotdef KERNARG_SIZE +.if KERNARG_SIZE % 8 != 0 // Kernarg alignment is 8. + .set KERNARG_SIZE, ((KERNARG_SIZE / 8) + 1) * 8 .endif madmix_instructions_available = 0 @@ -89,13 +81,39 @@ fmamix_instructions_available = 0 .endif .endif +// "gpr_alloc.inc" is not used. +// Let's define appropriate symbols manually. 
+.set .AUTO_SGPR_COUNT, 40 +.set .AUTO_VGPR_COUNT, 16 +.if ROCM_METADATA_VERSION == 4 + .AUTO_VGPR_GRANULATED_COUNT = (.AUTO_VGPR_COUNT - 1)/4 + .AUTO_SGPR_GRANULATED_COUNT = (.AUTO_SGPR_COUNT - 1)/8 +.endif +.set .AUTO_LDS_BYTE_SIZE, 136 + + +.if ROCM_METADATA_VERSION == 4 +.hsa_code_object_version 2,1 +.hsa_code_object_isa +.endif + +.text +.globl miopenGcnAsmBNFwdTrainSpatial +.type miopenGcnAsmBNFwdTrainSpatial,@function +.p2align 8 + +.if ROCM_METADATA_VERSION == 4 +.amdgpu_hsa_kernel miopenGcnAsmBNFwdTrainSpatial +.endif + miopenGcnAsmBNFwdTrainSpatial: +.if ROCM_METADATA_VERSION == 4 .amd_kernel_code_t kernel_code_entry_byte_offset = 256 kernel_code_prefetch_byte_size = 0 - granulated_workitem_vgpr_count = 3 - granulated_wavefront_sgpr_count = 4 + granulated_workitem_vgpr_count = .AUTO_VGPR_GRANULATED_COUNT + granulated_wavefront_sgpr_count = .AUTO_SGPR_GRANULATED_COUNT float_mode = 192 enable_dx10_clamp = 1 enable_ieee_mode = 1 @@ -116,20 +134,20 @@ miopenGcnAsmBNFwdTrainSpatial: private_element_size = 1 is_ptr64 = 1 workitem_private_segment_byte_size = 132 - workgroup_group_segment_byte_size = 136 + workgroup_group_segment_byte_size = .AUTO_LDS_BYTE_SIZE gds_segment_byte_size = 0 - kernarg_segment_byte_size = 136 + kernarg_segment_byte_size = KERNARG_SIZE workgroup_fbarrier_count = 0 - wavefront_sgpr_count = 40 - workitem_vgpr_count = 16 + wavefront_sgpr_count = .AUTO_SGPR_COUNT + workitem_vgpr_count = .AUTO_VGPR_COUNT debug_wavefront_private_segment_offset_sgpr = 0 - kernarg_segment_alignment = 4 + kernarg_segment_alignment = 8 group_segment_alignment = 4 private_segment_alignment = 4 wavefront_size = 6 call_convention = -1 .end_amd_kernel_code_t - +.endif s_mov_b32 s12, s8 s_mov_b32 s13, 0 @@ -380,9 +398,15 @@ end_of_program: .Lfunc_end0: .size miopenGcnAsmBNFwdTrainSpatial, .Lfunc_end0 - miopenGcnAsmBNFwdTrainSpatial -.macro metadata wg_x, save_flag, result_running_flag - .if ROCM_METADATA_VERSION == 4 - .if (\save_flag == 1) && (\result_running_flag == 
1) +static_assert(MIO_BN_GRP1 == 1 && MIO_BN_GRP2 == 1) // Required workgroup size and max flat workgroup size depend on this + +.if ROCM_METADATA_VERSION == 5 +.error "CO v3 is not supported yet" +.end + +.elseif ROCM_METADATA_VERSION == 4 + .macro METADATA sc, vc, wg_x, lds_size, kernarg_size + .if (MIO_SAVE_MEAN_VARIANCE == 1) && (MIO_RUNNING_RESULT == 1) .amd_amdgpu_hsa_metadata { Version: [ 1, 0 ], Kernels: @@ -390,7 +414,7 @@ end_of_program: Attrs: { ReqdWorkGroupSize: [ \wg_x, 1, 1 ] } CodeProps: - { KernargSegmentSize: 136, GroupSegmentFixedSize: 136, PrivateSegmentFixedSize: 132, KernargSegmentAlign: 8, WavefrontSize: 64, NumSGPRs: 40, NumVGPRs: 12, MaxFlatWorkGroupSize: \wg_x } + { KernargSegmentSize: \kernarg_size, GroupSegmentFixedSize: \lds_size, PrivateSegmentFixedSize: 132, KernargSegmentAlign: 8, WavefrontSize: 64, NumSGPRs: \sc, NumVGPRs: \vc, MaxFlatWorkGroupSize: \wg_x } Args: - { Name: in , Size: 8, Align: 8, ValueKind: GlobalBuffer, ValueType: F16, TypeName: 'half*', AddrSpaceQual: Global, AccQual: ReadOnly, IsConst: true, IsRestrict: true} - { Name: out , Size: 8, Align: 8, ValueKind: GlobalBuffer, ValueType: F16, TypeName: 'half*', AddrSpaceQual: Global, AccQual: Default, IsRestrict: true} @@ -403,13 +427,10 @@ end_of_program: - { Name: epsilon, Size: 8, Align: 8, ValueKind: ByValue, ValueType: F64, TypeName: 'double', AccQual: Default } - { Name: resultSaveMean, Size: 8, Align: 8, ValueKind: GlobalBuffer, ValueType: F32, TypeName: 'float*', AddrSpaceQual: Global, AccQual: Default, IsRestrict: true} - { Name: resultSaveInvVariance, Size: 8, Align: 8, ValueKind: GlobalBuffer, ValueType: F32, TypeName: 'float*', AddrSpaceQual: Global, AccQual: Default, IsRestrict: true} - //- { Name: HiddenGlobalOffsetX, Size: 8, Align: 8, ValueKind: ByValue, ValueType: I64 } - //- { Name: HiddenGlobalOffsetY, Size: 8, Align: 8, ValueKind: ByValue, ValueType: I64 } - //- { Name: HiddenGlobalOffsetZ, Size: 8, Align: 8, ValueKind: ByValue, ValueType: I64 } } } 
.end_amd_amdgpu_hsa_metadata - .elseif (\save_flag == 0) && (\result_running_flag == 1) + .elseif (MIO_SAVE_MEAN_VARIANCE == 0) && (MIO_RUNNING_RESULT == 1) .amd_amdgpu_hsa_metadata { Version: [ 1, 0 ], Kernels: @@ -417,7 +438,7 @@ end_of_program: Attrs: { ReqdWorkGroupSize: [ \wg_x, 1, 1 ] } CodeProps: - { KernargSegmentSize: 136, GroupSegmentFixedSize: 136, PrivateSegmentFixedSize: 132, KernargSegmentAlign: 8, WavefrontSize: 64, NumSGPRs: 40, NumVGPRs: 12, MaxFlatWorkGroupSize: \wg_x } + { KernargSegmentSize: \kernarg_size, GroupSegmentFixedSize: \lds_size, PrivateSegmentFixedSize: 132, KernargSegmentAlign: 8, WavefrontSize: 64, NumSGPRs: \sc, NumVGPRs: \vc, MaxFlatWorkGroupSize: \wg_x } Args: - { Name: in , Size: 8, Align: 8, ValueKind: GlobalBuffer, ValueType: F16, TypeName: 'half*', AddrSpaceQual: Global, AccQual: ReadOnly, IsConst: true, IsRestrict: true} - { Name: out , Size: 8, Align: 8, ValueKind: GlobalBuffer, ValueType: F16, TypeName: 'half*', AddrSpaceQual: Global, AccQual: Default, IsRestrict: true} @@ -428,13 +449,10 @@ end_of_program: - { Name: resultRunningMean, Size: 8, Align: 8, ValueKind: GlobalBuffer, ValueType: F32, TypeName: 'float*', AddrSpaceQual: Global, AccQual: Default, IsRestrict: true} - { Name: resultRunningVariance, Size: 8, Align: 8, ValueKind: GlobalBuffer, ValueType: F32, TypeName: 'float*', AddrSpaceQual: Global, AccQual: Default, IsRestrict: true} - { Name: epsilon, Size: 8, Align: 8, ValueKind: ByValue, ValueType: F64, TypeName: 'double', AccQual: Default } - //- { Name: HiddenGlobalOffsetX, Size: 8, Align: 8, ValueKind: ByValue, ValueType: I64 } - //- { Name: HiddenGlobalOffsetY, Size: 8, Align: 8, ValueKind: ByValue, ValueType: I64 } - //- { Name: HiddenGlobalOffsetZ, Size: 8, Align: 8, ValueKind: ByValue, ValueType: I64 } } } .end_amd_amdgpu_hsa_metadata - .elseif (\save_flag == 1) && (\result_running_flag == 0) + .elseif (MIO_SAVE_MEAN_VARIANCE == 1) && (MIO_RUNNING_RESULT == 0) .amd_amdgpu_hsa_metadata { Version: [ 1, 0 ], 
Kernels: @@ -442,7 +460,7 @@ end_of_program: Attrs: { ReqdWorkGroupSize: [ \wg_x, 1, 1 ] } CodeProps: - { KernargSegmentSize: 136, GroupSegmentFixedSize: 136, PrivateSegmentFixedSize: 132, KernargSegmentAlign: 8, WavefrontSize: 64, NumSGPRs: 40, NumVGPRs: 12, MaxFlatWorkGroupSize: \wg_x } + { KernargSegmentSize: \kernarg_size, GroupSegmentFixedSize: \lds_size, PrivateSegmentFixedSize: 132, KernargSegmentAlign: 8, WavefrontSize: 64, NumSGPRs: \sc, NumVGPRs: \vc, MaxFlatWorkGroupSize: \wg_x } Args: - { Name: in , Size: 8, Align: 8, ValueKind: GlobalBuffer, ValueType: F16, TypeName: 'half*', AddrSpaceQual: Global, AccQual: ReadOnly, IsConst: true, IsRestrict: true} - { Name: out , Size: 8, Align: 8, ValueKind: GlobalBuffer, ValueType: F16, TypeName: 'half*', AddrSpaceQual: Global, AccQual: Default, IsRestrict: true} @@ -452,13 +470,10 @@ end_of_program: - { Name: epsilon, Size: 8, Align: 8, ValueKind: ByValue, ValueType: F64, TypeName: 'double', AccQual: Default } - { Name: resultSaveMean, Size: 8, Align: 8, ValueKind: GlobalBuffer, ValueType: F32, TypeName: 'float*', AddrSpaceQual: Global, AccQual: Default, IsRestrict: true} - { Name: resultSaveInvVariance, Size: 8, Align: 8, ValueKind: GlobalBuffer, ValueType: F32, TypeName: 'float*', AddrSpaceQual: Global, AccQual: Default, IsRestrict: true} - //- { Name: HiddenGlobalOffsetX, Size: 8, Align: 8, ValueKind: ByValue, ValueType: I64 } - //- { Name: HiddenGlobalOffsetY, Size: 8, Align: 8, ValueKind: ByValue, ValueType: I64 } - //- { Name: HiddenGlobalOffsetZ, Size: 8, Align: 8, ValueKind: ByValue, ValueType: I64 } } } .end_amd_amdgpu_hsa_metadata - .elseif (\save_flag == 0) && (\result_running_flag == 0) + .elseif (MIO_SAVE_MEAN_VARIANCE == 0) && (MIO_RUNNING_RESULT == 0) .amd_amdgpu_hsa_metadata { Version: [ 1, 0 ], Kernels: @@ -466,7 +481,7 @@ end_of_program: Attrs: { ReqdWorkGroupSize: [ \wg_x, 1, 1 ] } CodeProps: - { KernargSegmentSize: 136, GroupSegmentFixedSize: 136, PrivateSegmentFixedSize: 132, 
KernargSegmentAlign: 8, WavefrontSize: 64, NumSGPRs: 40, NumVGPRs: 12, MaxFlatWorkGroupSize: \wg_x } + { KernargSegmentSize: \kernarg_size, GroupSegmentFixedSize: \lds_size, PrivateSegmentFixedSize: 132, KernargSegmentAlign: 8, WavefrontSize: 64, NumSGPRs: \sc, NumVGPRs: \vc, MaxFlatWorkGroupSize: \wg_x } Args: - { Name: in , Size: 8, Align: 8, ValueKind: GlobalBuffer, ValueType: F16, TypeName: 'half*', AddrSpaceQual: Global, AccQual: ReadOnly, IsConst: true, IsRestrict: true} - { Name: out , Size: 8, Align: 8, ValueKind: GlobalBuffer, ValueType: F16, TypeName: 'half*', AddrSpaceQual: Global, AccQual: Default, IsRestrict: true} @@ -474,18 +489,12 @@ end_of_program: - { Name: bias , Size: 8, Align: 8, ValueKind: GlobalBuffer, ValueType: F32, TypeName: 'float*', AddrSpaceQual: Constant, AccQual: ReadOnly, IsConst: true, IsRestrict: true} - { Name: INHW , Size: 4, Align: 4, ValueKind: ByValue, ValueType: F32, TypeName: 'float', AccQual: Default } - { Name: epsilon, Size: 8, Align: 8, ValueKind: ByValue, ValueType: F64, TypeName: 'double', AccQual: Default } - //- { Name: HiddenGlobalOffsetX, Size: 8, Align: 8, ValueKind: ByValue, ValueType: I64 } - //- { Name: HiddenGlobalOffsetY, Size: 8, Align: 8, ValueKind: ByValue, ValueType: I64 } - //- { Name: HiddenGlobalOffsetZ, Size: 8, Align: 8, ValueKind: ByValue, ValueType: I64 } } } .end_amd_amdgpu_hsa_metadata .endif - .else - .error "Unsupported ROCM_METADATA_VERSION" - .end - .endif -.endm + .endm +.endif //.if MIO_BN_GRP0 == 832 // metadata 832 @@ -494,12 +503,7 @@ end_of_program: //.endif .altmacro -.macro metadata_wrapper wg_x, save_flag, result_running_flag - metadata %\wg_x, %\save_flag, %\result_running_flag -.endm - -static_assert(MIO_BN_GRP1 == 1 && MIO_BN_GRP2 == 1) -metadata_wrapper MIO_BN_GRP0, MIO_SAVE_MEAN_VARIANCE, MIO_RUNNING_RESULT +METADATA %.AUTO_SGPR_COUNT, %.AUTO_VGPR_COUNT, %MIO_BN_GRP0, %.AUTO_LDS_BYTE_SIZE, %KERNARG_SIZE //metadata 1024 //metadata 832 diff --git a/src/kernels/utilities.inc 
b/src/kernels/utilities.inc new file mode 100644 index 0000000000..ff799eb98f --- /dev/null +++ b/src/kernels/utilities.inc @@ -0,0 +1,45 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2019 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +.macro default symbol, value + .ifnotdef \symbol + \symbol = \value + .endif +.endm + +.macro error_ifnotdef symbol + .ifnotdef \symbol + .error "\symbol is not defined." 
+ .end + .endif +.endm + +.macro static_assert fufufu + .if !\fufufu + .error "\fufufu is false" + .end + .endif +.endm diff --git a/src/kernels/xform_data_filter.inc b/src/kernels/xform_data_filter.inc index 69e6babba4..41521e333f 100644 --- a/src/kernels/xform_data_filter.inc +++ b/src/kernels/xform_data_filter.inc @@ -26,7 +26,8 @@ .include "rocm_version.inc" .include "gpr_alloc.inc" .include "inst_wrappers.inc" -.include "common.inc" +.include "utilities.inc" +.include "conv_common.inc" .altmacro // limits: diff --git a/src/kernels/xform_out.s b/src/kernels/xform_out.s index b203d41823..58198beca8 100644 --- a/src/kernels/xform_out.s +++ b/src/kernels/xform_out.s @@ -34,7 +34,8 @@ .include "rocm_version.inc" .include "inst_wrappers.inc" .include "gpr_alloc.inc" -.include "common.inc" +.include "utilities.inc" +.include "conv_common.inc" // kernarg layout: diff --git a/src/ocl/batchnormocl.cpp b/src/ocl/batchnormocl.cpp index 3f355d6883..280b0d5776 100644 --- a/src/ocl/batchnormocl.cpp +++ b/src/ocl/batchnormocl.cpp @@ -292,7 +292,8 @@ void BatchNormForwardTraining(Handle& handle, NHW_value.f32 = static_cast(in_nhw / (in_nhw - 1.0)); // clang-format off - parms = " -Wa,-defsym,ROCM_METADATA_VERSION=4" + parms = std::string() + + " -Wa,-defsym,ROCM_METADATA_VERSION=" + (ctx.rmv.UseV3() ? "5" : "4") + " -Wa,-defsym,MIOPEN_USE_FP16=" + std::to_string(static_cast(bfp16parm)) + " -Wa,-defsym,MIOPEN_USE_FP32=" + std::to_string(static_cast(bfp32parm)) + " -Wa,-defsym,MIOPEN_USE_FPMIX=" + std::to_string(static_cast(bfpmixparm)) + @@ -470,7 +471,8 @@ void BatchNormForwardTraining(Handle& handle, NHW_value.f32 = static_cast(in_nhw / (in_nhw - 1.0)); // clang-format off - parms = " -Wa,-defsym,ROCM_METADATA_VERSION=4" + parms = std::string() + + " -Wa,-defsym,ROCM_METADATA_VERSION=" + (ctx.rmv.UseV3() ? 
"5" : "4") + " -Wa,-defsym,MIOPEN_USE_FP16=" + std::to_string(static_cast(bfp16parm)) + " -Wa,-defsym,MIOPEN_USE_FP32=" + std::to_string(static_cast(bfp32parm)) + " -Wa,-defsym,MIOPEN_USE_FPMIX=" + std::to_string(static_cast(bfpmixparm)) + @@ -1102,7 +1104,7 @@ void BatchNormBackward(Handle& handle, std::string parms; if((n > 64) && (n % 2 == 0) && (variant == 3) && (bfpmixparm) && (useSaved) && - ctx.use_asm_kernels && ctx.rmv.IsV2()) + ctx.use_asm_kernels && ctx.rmv.IsV2orV3()) { kernel_name = "miopenGcnAsmBNBwdTrainSpatial"; program_name = "gcnAsmBNBwdTrainSpatial.s"; @@ -1120,7 +1122,8 @@ void BatchNormBackward(Handle& handle, NHW_value.f32 = static_cast(in_nhw); // clang-format off - parms = " -Wa,-defsym,ROCM_METADATA_VERSION=4" + parms = std::string() + + " -Wa,-defsym,ROCM_METADATA_VERSION=" + (ctx.rmv.UseV3() ? "5" : "4") + " -Wa,-defsym,MIOPEN_USE_FP16=" + std::to_string(static_cast(bfp16parm)) + " -Wa,-defsym,MIOPEN_USE_FP32=" + std::to_string(static_cast(bfp32parm)) + " -Wa,-defsym,MIOPEN_USE_FPMIX=" + std::to_string(static_cast(bfpmixparm)) + From 6778e4f31fbf559105513d49da81cf18ca87938c Mon Sep 17 00:00:00 2001 From: Artem Tamazov Date: Tue, 17 Dec 2019 19:02:24 +0300 Subject: [PATCH 11/15] CO v3 auto-detection (#2306) * cov3-autodetect(01) Remove useless env.var from OCL BE. Update documentation. 
* cov3-autodetect(02) Autodetection: ROCm 2.10 supports both v2 and v3 --- doc/src/DebugAndLogging.md | 12 ++++++++---- src/include/miopen/mlo_internal.hpp | 12 +++++------- src/mlo_dir_conv.cpp | 16 ++++++++++------ src/ocl/clhelper.cpp | 7 ------- 4 files changed, 23 insertions(+), 24 deletions(-) diff --git a/doc/src/DebugAndLogging.md b/doc/src/DebugAndLogging.md index 49fc40c7f2..80c415b25a 100644 --- a/doc/src/DebugAndLogging.md +++ b/doc/src/DebugAndLogging.md @@ -155,10 +155,14 @@ More information on logging with RocBlas can be found [here](https://github.com/ ### Code Object (CO) version selection (EXPERIMENTAL) -currently, ROCm fully supports Code Object version 2 (Co v2). The support for version 3 (CO v3) is being gradually introduced. These variables allows for experimenting and triaging problems related to CO version: -* `MIOPEN_DEBUG_AMD_ROCM_METADATA_ENFORCE` - Overrides CO version auto-detection implemented in the library. `0` or unset - disable overriding (the default), `1` - enforces CO v2, `2` - behave as if both CO v2 and v3 are supported, `2` - enforces CO v3. -* `MIOPEN_DEBUG_AMD_ROCM_METADATA_PREFER_NEWER` - This variable affects only Solutions available in both v2 and v3 code objects, and is intended to use only when ROCm supports both CO v2 and CO v3. By default, the older format is used (CO v2). When this variable is _enabled_, the behavior is reversed. -* `MIOPEN_DEBUG_AMD_OPENCL_ENFORCE_COV3` - Enforces CO v3 for OpenCL kernels. +Different ROCm versions use Code Object files of different versions (or, in other words, formats). The library uses suitable version automatically. The following variables allow for experimenting and triaging possible problems related to CO version: +* `MIOPEN_DEBUG_AMD_ROCM_METADATA_ENFORCE` - Affects kernels written in GCN assembly language. Overrides CO version auto-detection implemented in the library. + * `0` or unset - Automatically detect CO version (the default). + * `1` - Enforce CO v2. 
+ * `2` - Behave as if both CO v2 and v3 are supported (see `MIOPEN_DEBUG_AMD_ROCM_METADATA_PREFER_NEWER`). + * `3` - Enforce CO v3. +* `MIOPEN_DEBUG_AMD_ROCM_METADATA_PREFER_NEWER` - This variable affects assembly kernels available in both v2 and v3 code object formats and works only when ROCm supports both CO v2 and CO v3. By default, the older format is used (CO v2). When this variable is _enabled_, the behavior is reversed. +* `MIOPEN_DEBUG_AMD_OPENCL_ENFORCE_COV3` - Enforces CO v3 for OpenCL kernels. Works with HIP backend only (`cmake ... -DMIOPEN_BACKEND=HIP...`). ### `MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_WORKSPACE_MAX` diff --git a/src/include/miopen/mlo_internal.hpp b/src/include/miopen/mlo_internal.hpp index e2b021291d..07ef738842 100644 --- a/src/include/miopen/mlo_internal.hpp +++ b/src/include/miopen/mlo_internal.hpp @@ -132,13 +132,11 @@ class rocm_meta_version int val = Unknown; public: - static constexpr int - Unknown = 0, // Unset env.vars read as 0. - AMDHSA_COv2 = 1, // 1.0, see https://llvm.org/docs/AMDGPUUsage.html#code-object-metadata - AMDHSA_COv2_COv3 = 2, // E.g. ROCm 2.6 supports both. - AMDHSA_COv3 = 3, - Default = - AMDHSA_COv2; // Assumption for HIP backend. To be updated together with ROCm release. + static constexpr int Unknown = 0, // Unset env.vars read as 0. + AMDHSA_COv2 = 1, // V2 metadata, https://llvm.org/docs/AMDGPUUsage.html + AMDHSA_COv2_COv3 = 2, // E.g. ROCm 2.10 supports both. + AMDHSA_COv3 = 3, // V3 metadata, https://llvm.org/docs/AMDGPUUsage.html + Default = AMDHSA_COv2; // Used when auto-detection fails. 
private: static constexpr int End = 4, Begin = Unknown; diff --git a/src/mlo_dir_conv.cpp b/src/mlo_dir_conv.cpp index 292fc07be2..4ed715f7dd 100644 --- a/src/mlo_dir_conv.cpp +++ b/src/mlo_dir_conv.cpp @@ -361,8 +361,11 @@ static rocm_meta_version AmdRocmMetadataVersionDetect(const miopen::ConvolutionC size_t num_begin = platform_version.find('('); if(num_begin != std::string::npos) { - // int num = std::stoi(platform_version.substr(num_begin + 1)); - rmv = rocm_meta_version::AMDHSA_COv2; + const int num = std::stoi(platform_version.substr(num_begin + 1)); + if(num >= 3029) // ROCm 2.10 RC 1341 + rmv = rocm_meta_version::AMDHSA_COv2_COv3; + else + rmv = rocm_meta_version::AMDHSA_COv2; } else { @@ -370,10 +373,11 @@ static rocm_meta_version AmdRocmMetadataVersionDetect(const miopen::ConvolutionC } #else (void)context; - rmv = rocm_meta_version::Default; - /// This is only to print information onto console. - /// \todo Consider removing this call in installable builds. - (void)miopen::HipGetHccVersion(); + if(miopen::HipGetHccVersion() >= + miopen::external_tool_version_t{2, 10, 19392}) // ROCm 2.10 RC 1341 + rmv = rocm_meta_version::AMDHSA_COv2_COv3; + else + rmv = rocm_meta_version::Default; #endif // MIOPEN_BACKEND_OPENCL } MIOPEN_LOG_NQI( diff --git a/src/ocl/clhelper.cpp b/src/ocl/clhelper.cpp index 0b4fef94b3..b1d0bf12b8 100644 --- a/src/ocl/clhelper.cpp +++ b/src/ocl/clhelper.cpp @@ -40,8 +40,6 @@ #include #include -MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_AMD_OPENCL_ENFORCE_COV3) - namespace miopen { void ParseDevName(std::string& name) @@ -190,11 +188,6 @@ ClProgramPtr LoadProgram(cl_context ctx, #endif #endif params += " -cl-std=CL1.2"; - if(miopen::IsEnabled(MIOPEN_DEBUG_AMD_OPENCL_ENFORCE_COV3{})) - { - /// \todo Seems not working with ROCm 2.6 - params += " -Wf,-Xclang,-target-feature,-Xclang,+code-object-v3"; - } BuildProgram(result.get(), device, params); return result; } From 0c4808b70ffa6805928330f1361748253ddf772d Mon Sep 17 00:00:00 2001 From: Artem 
Tamazov Date: Fri, 20 Dec 2019 01:30:56 +0300 Subject: [PATCH 12/15] CO v3: Make it the default for ROCm 2.10. Update docs. (#2308) * cov3-autodetect(01) Remove useless env.var from OCL BE. Update documentation. * cov3-autodetect(02) Autodetection: ROCm 2.10 supports both v2 and v3 * cov3-prefer(01) Implemented. * cov3-prefer(03) Rework rocm_meta_version::UseV3() --- doc/src/DebugAndLogging.md | 12 ++++++------ src/mlo_dir_conv.cpp | 21 ++++++++++++--------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/doc/src/DebugAndLogging.md b/doc/src/DebugAndLogging.md index 80c415b25a..bdcd7228a7 100644 --- a/doc/src/DebugAndLogging.md +++ b/doc/src/DebugAndLogging.md @@ -156,12 +156,12 @@ More information on logging with RocBlas can be found [here](https://github.com/ ### Code Object (CO) version selection (EXPERIMENTAL) Different ROCm versions use Code Object files of different versions (or, in other words, formats). The library uses suitable version automatically. The following variables allow for experimenting and triaging possible problems related to CO version: -* `MIOPEN_DEBUG_AMD_ROCM_METADATA_ENFORCE` - Affects kernels written in GCN assembly language. Overrides CO version auto-detection implemented in the library. - * `0` or unset - Automatically detect CO version (the default). - * `1` - Enforce CO v2. - * `2` - Behave as if both CO v2 and v3 are supported (see `MIOPEN_DEBUG_AMD_ROCM_METADATA_PREFER_NEWER`). - * `3` - Enforce CO v3. -* `MIOPEN_DEBUG_AMD_ROCM_METADATA_PREFER_NEWER` - This variable affects assembly kernels available in both v2 and v3 code object formats and works only when ROCm supports both CO v2 and CO v3. By default, the older format is used (CO v2). When this variable is _enabled_, the behavior is reversed. +* `MIOPEN_DEBUG_AMD_ROCM_METADATA_ENFORCE` - Affects kernels written in GCN assembly language. + * `0` or unset - Automatically detect the required CO version and assemble to that version. This is the default. 
+ * `1` - Do not auto-detect Code Object version, always assemble v2 Code Objects. + * `2` - Behave as if both CO v2 and v3 are supported (see `MIOPEN_DEBUG_AMD_ROCM_METADATA_PREFER_OLDER`). + * `3` - Always assemble v3 Code Objects. +* `MIOPEN_DEBUG_AMD_ROCM_METADATA_PREFER_OLDER` - This variable affects only assembly kernels, and only when ROCm supports both CO v2 and CO v3 (like ROCm 2.10). By default, the newer format is used (CO v3). When this variable is _enabled_, the behavior is reversed. * `MIOPEN_DEBUG_AMD_OPENCL_ENFORCE_COV3` - Enforces CO v3 for OpenCL kernels. Works with HIP backend only (`cmake ... -DMIOPEN_BACKEND=HIP...`). ### `MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_WORKSPACE_MAX` diff --git a/src/mlo_dir_conv.cpp b/src/mlo_dir_conv.cpp index 4ed715f7dd..b605e10451 100644 --- a/src/mlo_dir_conv.cpp +++ b/src/mlo_dir_conv.cpp @@ -62,7 +62,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_GCN_ASM_KERNELS) MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_AMD_ROCM_PRECOMPILED_BINARIES) MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_OPENCL_CONVOLUTIONS) MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_AMD_ROCM_METADATA_ENFORCE) -MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_AMD_ROCM_METADATA_PREFER_NEWER) +MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_AMD_ROCM_METADATA_PREFER_OLDER) // Only select first applicable implicitgemm kernel due to slow compilation time // (issue SWDEV-201055) and tuning @@ -309,17 +309,20 @@ static bool IsAmdRocmOpencl(const miopen::ConvolutionContext& context) /// However, when both ROCm and Solver are able to support both code object formats, /// these is no objective criterion for making a decision. The following behavior /// is implemented: -/// * By default, the older format is used (CO v2). -/// * If MIOPEN_DEBUG_AMD_ROCM_METADATA_PREFER_NEWER is set to 1, then -/// the behavior is reversed and CO v3 is selected. +/// * By default, the newer format is used (CO v3). +/// * If MIOPEN_DEBUG_AMD_ROCM_METADATA_PREFER_OLDER is set to 1, then +/// the behavior is reversed and CO v2 is selected. 
/// -/// FIXME move this out of the rocm_meta_version class. +/// \todo Dismiss MIOPEN_DEBUG_AMD_ROCM_METADATA_PREFER_OLDER (and, possibly, +/// rocm_meta_version::AMDHSA_COv2_COv3) as soon as MIOpen drops support for the +/// ROCm runtimes that can load and run both v2 and v3 Code Objects. +/// +/// \todo Move this out of the rocm_meta_version class. bool rocm_meta_version::UseV3() const { - if(miopen::IsEnabled(MIOPEN_DEBUG_AMD_ROCM_METADATA_PREFER_NEWER{})) - return val == AMDHSA_COv3 || val == AMDHSA_COv2_COv3; - else - return val == AMDHSA_COv3; + if(val == AMDHSA_COv2_COv3) + return !miopen::IsEnabled(MIOPEN_DEBUG_AMD_ROCM_METADATA_PREFER_OLDER{}); + return (val == AMDHSA_COv3); } static std::ostream& operator<<(std::ostream& os, const rocm_meta_version& rmv) From 8e096a44e2c162f106b7bd60935f8f278c4e3955 Mon Sep 17 00:00:00 2001 From: Daniel Lowell Date: Wed, 22 Jan 2020 12:13:16 -0600 Subject: [PATCH 13/15] Updated documentation for 2.2.1 --- doc/src/releasenotes.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/doc/src/releasenotes.md b/doc/src/releasenotes.md index 37446ee731..b6a749cc3a 100644 --- a/doc/src/releasenotes.md +++ b/doc/src/releasenotes.md @@ -3,6 +3,22 @@ +### 01/24/2020 [ 2.2.1 ] + +- This release contains bug fixes, documentation updates, and further code object version 3 support + + +Changes: + +- Added support for multiple ROCm installations +- Added additional support for code object v3 +- Fixed issue with incorrect LRN calculation [#127](https://github.com/ROCmSoftwarePlatform/MIOpen/issues/127) +- Fixed incorrect performance database documentation +- Fixed issue with incorrect workspace calculation in group convolutions +- Fixed issue with unsupported hardware instructions used with inline assembly + + + ### 12/19/2019 [ 2.2.0 ] - This release contains bug fixes, performance improvements, and expanded applicability for specific convolutional algorithms. 
From b4676d24f5c40be61432cedb36d7eaf426ce20a5 Mon Sep 17 00:00:00 2001 From: Paul Fultz II Date: Mon, 27 Jan 2020 22:40:52 -0600 Subject: [PATCH 14/15] Add search paths for assembler used in hcc (#2383) --- CMakeLists.txt | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a91cc64c30..ded66d7030 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -207,8 +207,15 @@ endif() # Online assembler find_program(MIOPEN_AMDGCN_ASSEMBLER NAMES clang - PATHS ${MIOPEN_AMDGCN_ASSEMBLER_PATH} ${CMAKE_INSTALL_PREFIX} - PATH_SUFFIXES /opencl/bin/x86_64 + PATHS + ${MIOPEN_AMDGCN_ASSEMBLER_PATH} + /opt/rocm + /opt/rocm/hcc + ${CMAKE_INSTALL_PREFIX} + ${CMAKE_INSTALL_PREFIX}/hcc + PATH_SUFFIXES + /opencl/bin/x86_64 + /bin NO_DEFAULT_PATH ) message(STATUS "AMDGCN assembler: ${MIOPEN_AMDGCN_ASSEMBLER}") From 92186831ab35f8bbbd38fee52e9895879cf12fb1 Mon Sep 17 00:00:00 2001 From: JD Date: Wed, 29 Jan 2020 08:34:13 -0600 Subject: [PATCH 15/15] Fix for SWDEV-220166 (#2387) * check for hcc * add comment for ticket reference * address PR comments --- src/ocl/gcn_asm_utils.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/ocl/gcn_asm_utils.cpp b/src/ocl/gcn_asm_utils.cpp index 78029aa3ae..8c6d405843 100644 --- a/src/ocl/gcn_asm_utils.cpp +++ b/src/ocl/gcn_asm_utils.cpp @@ -100,7 +100,11 @@ bool ValidateGcnAssemblerImpl() std::string clang_result_line; std::getline(clang_stdout, clang_result_line); MIOPEN_LOG_NQI2(clang_result_line); - if(clang_result_line.find("clang") != std::string::npos) + if(clang_result_line.find("HCC") != std::string::npos) + // Temporary fix for SWDEV-220166 which causes clang to report unknown + // architecture for AMD GCN + return true; + else if(clang_result_line.find("clang") != std::string::npos) { while(!clang_stdout.eof()) {