Skip to content

Commit

Permalink
[NHWC][asm igemm][GFX908][GFX90a] fix fwd kernel that have computation error in small cyx case (#1137)
Browse files Browse the repository at this point in the history

* add gfx908 fwd kernel for fix small cyx

* add gfx90a fwd kernel for fix small cyx

* add ctest for the failed case
  • Loading branch information
carlushuang authored Sep 6, 2021
1 parent 4e7b1ab commit 10a787b
Show file tree
Hide file tree
Showing 303 changed files with 1,925 additions and 12,552 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,47 +23,9 @@
* SOFTWARE.
*
*******************************************************************************/
; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0)
; generated by igemm_codegen.py (32a41f791dcf0139e95f217f3905939fbbae794c)
;
; .mdiv_u32_ss: scalar quotient from a caller-supplied multiplier and shift.
;   s[s_quot] = (s_mul_hi_u32(s[s_magic], s[s_numer]) + s[s_numer]) >> s[s_shift]
; Every argument is an SGPR index; the macro wraps each one as s[...].
;   s_quot  - receives the result
;   s_numer - value being divided
;   s_magic - multiplier supplied by the caller
;   s_shift - logical right-shift amount supplied by the caller
;   s_tmp   - scratch register, overwritten
; s_tmp must not be the same register as s_numer: s_tmp is written by the
; first instruction and s_numer is read again by the second.
; NOTE(review): presumably s_magic/s_shift encode division by a fixed
; denominator (magic-number division); their derivation is not visible
; here - confirm against the host-side code.
.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp
s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer]
s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer]
s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift]
.endm

; .mdiv_u32_rem_ss: scalar quotient and remainder.
;   s[s_quot] = result of .mdiv_u32_ss applied to s[s_numer]
;   s[s_rem]  = s[s_numer] - s[s_denom] * s[s_quot]
; Every argument is an SGPR index. s_tmp is scratch and is overwritten.
; s_quot and s_tmp must differ from s_numer, because s_numer is read again
; by the final subtract after both of them have been written.
; NOTE(review): the remainder is only meaningful when s_magic/s_shift were
; generated for the value held in s_denom - confirm at the call sites.
.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp
.mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp
s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot]
s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp]
.endm

; .mdiv_u32_vs: per-lane (vector) quotient from a scalar multiplier and shift.
;   v[v_quot] = (v_mul_hi_u32(s[s_magic], v[v_numer]) + v[v_numer]) >> s[s_shift]
; v_quot, v_numer and v_tmp are VGPR indices; s_magic and s_shift are SGPR
; indices. The macro wraps them as v[...] / s[...].
;   v_quot  - receives the result
;   v_numer - value being divided
;   s_magic - multiplier supplied by the caller
;   s_shift - shift amount (first source of v_lshrrev_b32)
;   v_tmp   - scratch register, overwritten
; v_tmp must not be the same register as v_numer: v_tmp is written by the
; first instruction and v_numer is read again by the second.
; NOTE(review): presumably s_magic/s_shift encode division by a fixed
; denominator (magic-number division); their derivation is not visible
; here - confirm against the host-side code.
.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp
v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer]
v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer]
v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp]
.endm

; .mdiv_u32_rem_vs: per-lane (vector) quotient and remainder.
;   v[v_quot] = result of .mdiv_u32_vs applied to v[v_numer]
;   v[v_rem]  = v[v_numer] - s[s_denom] * v[v_quot]
; v_rem, v_quot, v_numer and v_tmp are VGPR indices; s_magic, s_shift and
; s_denom are SGPR indices. v_tmp is scratch and is overwritten.
; v_quot and v_tmp must differ from v_numer, because v_numer is read again
; by the final subtract after both of them have been written.
; NOTE(review): the remainder is only meaningful when s_magic/s_shift were
; generated for the value held in s_denom - confirm at the call sites.
.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp
.mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp
v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot]
v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp]
.endm

; .v_clear_acc_c: write 0 to num consecutive accumulator registers,
; a[a] .. a[a + num - 1], emitting one v_accvgpr_write_b32 per register.
;   a   - index of the first accumulator register to clear
;   num - number of registers to clear (used as the .rept count)
; The assembler symbol _a is the running index. It is reassigned on every
; expansion and is left equal to a + num afterwards, so any other use of
; _a in the same assembly unit is overwritten.
.macro .v_clear_acc_c a, num
_a = \a
.rept \num
v_accvgpr_write_b32 a[_a], 0
_a = _a + 1
.endr
.endm

; .v_clear_nc: write 0 to num consecutive vector registers,
; v[vid] .. v[vid + num - 1], emitting one v_mov_b32 per register.
;   vid - index of the first VGPR to clear
;   num - number of registers to clear (used as the .rept count)
; The assembler symbol _v is the running index. It is reassigned on every
; expansion and is left equal to vid + num afterwards, so any other use of
; _v in the same assembly unit is overwritten.
.macro .v_clear_nc vid, num
_v = \vid
.rept \num
v_mov_b32 v[_v], 0
_v = _v + 1
.endr
.endm
.include "igemm_fwd_gtcx_nhwc_fp16_utils.inc"

;----------------------------------------------------------
; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,47 +23,9 @@
* SOFTWARE.
*
*******************************************************************************/
; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0)
; generated by igemm_codegen.py (32a41f791dcf0139e95f217f3905939fbbae794c)
;
; .mdiv_u32_ss: scalar quotient from a caller-supplied multiplier and shift.
;   s[s_quot] = (s_mul_hi_u32(s[s_magic], s[s_numer]) + s[s_numer]) >> s[s_shift]
; Every argument is an SGPR index; the macro wraps each one as s[...].
;   s_quot  - receives the result
;   s_numer - value being divided
;   s_magic - multiplier supplied by the caller
;   s_shift - logical right-shift amount supplied by the caller
;   s_tmp   - scratch register, overwritten
; s_tmp must not be the same register as s_numer: s_tmp is written by the
; first instruction and s_numer is read again by the second.
; NOTE(review): presumably s_magic/s_shift encode division by a fixed
; denominator (magic-number division); their derivation is not visible
; here - confirm against the host-side code.
.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp
s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer]
s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer]
s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift]
.endm

; .mdiv_u32_rem_ss: scalar quotient and remainder.
;   s[s_quot] = result of .mdiv_u32_ss applied to s[s_numer]
;   s[s_rem]  = s[s_numer] - s[s_denom] * s[s_quot]
; Every argument is an SGPR index. s_tmp is scratch and is overwritten.
; s_quot and s_tmp must differ from s_numer, because s_numer is read again
; by the final subtract after both of them have been written.
; NOTE(review): the remainder is only meaningful when s_magic/s_shift were
; generated for the value held in s_denom - confirm at the call sites.
.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp
.mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp
s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot]
s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp]
.endm

; .mdiv_u32_vs: per-lane (vector) quotient from a scalar multiplier and shift.
;   v[v_quot] = (v_mul_hi_u32(s[s_magic], v[v_numer]) + v[v_numer]) >> s[s_shift]
; v_quot, v_numer and v_tmp are VGPR indices; s_magic and s_shift are SGPR
; indices. The macro wraps them as v[...] / s[...].
;   v_quot  - receives the result
;   v_numer - value being divided
;   s_magic - multiplier supplied by the caller
;   s_shift - shift amount (first source of v_lshrrev_b32)
;   v_tmp   - scratch register, overwritten
; v_tmp must not be the same register as v_numer: v_tmp is written by the
; first instruction and v_numer is read again by the second.
; NOTE(review): presumably s_magic/s_shift encode division by a fixed
; denominator (magic-number division); their derivation is not visible
; here - confirm against the host-side code.
.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp
v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer]
v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer]
v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp]
.endm

; .mdiv_u32_rem_vs: per-lane (vector) quotient and remainder.
;   v[v_quot] = result of .mdiv_u32_vs applied to v[v_numer]
;   v[v_rem]  = v[v_numer] - s[s_denom] * v[v_quot]
; v_rem, v_quot, v_numer and v_tmp are VGPR indices; s_magic, s_shift and
; s_denom are SGPR indices. v_tmp is scratch and is overwritten.
; v_quot and v_tmp must differ from v_numer, because v_numer is read again
; by the final subtract after both of them have been written.
; NOTE(review): the remainder is only meaningful when s_magic/s_shift were
; generated for the value held in s_denom - confirm at the call sites.
.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp
.mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp
v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot]
v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp]
.endm

; .v_clear_acc_c: write 0 to num consecutive accumulator registers,
; a[a] .. a[a + num - 1], emitting one v_accvgpr_write_b32 per register.
;   a   - index of the first accumulator register to clear
;   num - number of registers to clear (used as the .rept count)
; The assembler symbol _a is the running index. It is reassigned on every
; expansion and is left equal to a + num afterwards, so any other use of
; _a in the same assembly unit is overwritten.
.macro .v_clear_acc_c a, num
_a = \a
.rept \num
v_accvgpr_write_b32 a[_a], 0
_a = _a + 1
.endr
.endm

; .v_clear_nc: write 0 to num consecutive vector registers,
; v[vid] .. v[vid + num - 1], emitting one v_mov_b32 per register.
;   vid - index of the first VGPR to clear
;   num - number of registers to clear (used as the .rept count)
; The assembler symbol _v is the running index. It is reassigned on every
; expansion and is left equal to vid + num afterwards, so any other use of
; _v in the same assembly unit is overwritten.
.macro .v_clear_nc vid, num
_v = \vid
.rept \num
v_mov_b32 v[_v], 0
_v = _v + 1
.endr
.endm
.include "igemm_fwd_gtcx_nhwc_fp16_utils.inc"

;----------------------------------------------------------
; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x2x1_1x4x1x64_gkgs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,47 +23,9 @@
* SOFTWARE.
*
*******************************************************************************/
; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0)
; generated by igemm_codegen.py (32a41f791dcf0139e95f217f3905939fbbae794c)
;
; .mdiv_u32_ss: scalar quotient from a caller-supplied multiplier and shift.
;   s[s_quot] = (s_mul_hi_u32(s[s_magic], s[s_numer]) + s[s_numer]) >> s[s_shift]
; Every argument is an SGPR index; the macro wraps each one as s[...].
;   s_quot  - receives the result
;   s_numer - value being divided
;   s_magic - multiplier supplied by the caller
;   s_shift - logical right-shift amount supplied by the caller
;   s_tmp   - scratch register, overwritten
; s_tmp must not be the same register as s_numer: s_tmp is written by the
; first instruction and s_numer is read again by the second.
; NOTE(review): presumably s_magic/s_shift encode division by a fixed
; denominator (magic-number division); their derivation is not visible
; here - confirm against the host-side code.
.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp
s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer]
s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer]
s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift]
.endm

; .mdiv_u32_rem_ss: scalar quotient and remainder.
;   s[s_quot] = result of .mdiv_u32_ss applied to s[s_numer]
;   s[s_rem]  = s[s_numer] - s[s_denom] * s[s_quot]
; Every argument is an SGPR index. s_tmp is scratch and is overwritten.
; s_quot and s_tmp must differ from s_numer, because s_numer is read again
; by the final subtract after both of them have been written.
; NOTE(review): the remainder is only meaningful when s_magic/s_shift were
; generated for the value held in s_denom - confirm at the call sites.
.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp
.mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp
s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot]
s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp]
.endm

; .mdiv_u32_vs: per-lane (vector) quotient from a scalar multiplier and shift.
;   v[v_quot] = (v_mul_hi_u32(s[s_magic], v[v_numer]) + v[v_numer]) >> s[s_shift]
; v_quot, v_numer and v_tmp are VGPR indices; s_magic and s_shift are SGPR
; indices. The macro wraps them as v[...] / s[...].
;   v_quot  - receives the result
;   v_numer - value being divided
;   s_magic - multiplier supplied by the caller
;   s_shift - shift amount (first source of v_lshrrev_b32)
;   v_tmp   - scratch register, overwritten
; v_tmp must not be the same register as v_numer: v_tmp is written by the
; first instruction and v_numer is read again by the second.
; NOTE(review): presumably s_magic/s_shift encode division by a fixed
; denominator (magic-number division); their derivation is not visible
; here - confirm against the host-side code.
.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp
v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer]
v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer]
v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp]
.endm

; .mdiv_u32_rem_vs: per-lane (vector) quotient and remainder.
;   v[v_quot] = result of .mdiv_u32_vs applied to v[v_numer]
;   v[v_rem]  = v[v_numer] - s[s_denom] * v[v_quot]
; v_rem, v_quot, v_numer and v_tmp are VGPR indices; s_magic, s_shift and
; s_denom are SGPR indices. v_tmp is scratch and is overwritten.
; v_quot and v_tmp must differ from v_numer, because v_numer is read again
; by the final subtract after both of them have been written.
; NOTE(review): the remainder is only meaningful when s_magic/s_shift were
; generated for the value held in s_denom - confirm at the call sites.
.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp
.mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp
v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot]
v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp]
.endm

; .v_clear_acc_c: write 0 to num consecutive accumulator registers,
; a[a] .. a[a + num - 1], emitting one v_accvgpr_write_b32 per register.
;   a   - index of the first accumulator register to clear
;   num - number of registers to clear (used as the .rept count)
; The assembler symbol _a is the running index. It is reassigned on every
; expansion and is left equal to a + num afterwards, so any other use of
; _a in the same assembly unit is overwritten.
.macro .v_clear_acc_c a, num
_a = \a
.rept \num
v_accvgpr_write_b32 a[_a], 0
_a = _a + 1
.endr
.endm

; .v_clear_nc: write 0 to num consecutive vector registers,
; v[vid] .. v[vid + num - 1], emitting one v_mov_b32 per register.
;   vid - index of the first VGPR to clear
;   num - number of registers to clear (used as the .rept count)
; The assembler symbol _v is the running index. It is reassigned on every
; expansion and is left equal to vid + num afterwards, so any other use of
; _v in the same assembly unit is overwritten.
.macro .v_clear_nc vid, num
_v = \vid
.rept \num
v_mov_b32 v[_v], 0
_v = _v + 1
.endr
.endm
.include "igemm_fwd_gtcx_nhwc_fp16_utils.inc"

;----------------------------------------------------------
; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,47 +23,9 @@
* SOFTWARE.
*
*******************************************************************************/
; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0)
; generated by igemm_codegen.py (32a41f791dcf0139e95f217f3905939fbbae794c)
;
; .mdiv_u32_ss: scalar quotient from a caller-supplied multiplier and shift.
;   s[s_quot] = (s_mul_hi_u32(s[s_magic], s[s_numer]) + s[s_numer]) >> s[s_shift]
; Every argument is an SGPR index; the macro wraps each one as s[...].
;   s_quot  - receives the result
;   s_numer - value being divided
;   s_magic - multiplier supplied by the caller
;   s_shift - logical right-shift amount supplied by the caller
;   s_tmp   - scratch register, overwritten
; s_tmp must not be the same register as s_numer: s_tmp is written by the
; first instruction and s_numer is read again by the second.
; NOTE(review): presumably s_magic/s_shift encode division by a fixed
; denominator (magic-number division); their derivation is not visible
; here - confirm against the host-side code.
.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp
s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer]
s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer]
s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift]
.endm

; .mdiv_u32_rem_ss: scalar quotient and remainder.
;   s[s_quot] = result of .mdiv_u32_ss applied to s[s_numer]
;   s[s_rem]  = s[s_numer] - s[s_denom] * s[s_quot]
; Every argument is an SGPR index. s_tmp is scratch and is overwritten.
; s_quot and s_tmp must differ from s_numer, because s_numer is read again
; by the final subtract after both of them have been written.
; NOTE(review): the remainder is only meaningful when s_magic/s_shift were
; generated for the value held in s_denom - confirm at the call sites.
.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp
.mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp
s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot]
s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp]
.endm

; .mdiv_u32_vs: per-lane (vector) quotient from a scalar multiplier and shift.
;   v[v_quot] = (v_mul_hi_u32(s[s_magic], v[v_numer]) + v[v_numer]) >> s[s_shift]
; v_quot, v_numer and v_tmp are VGPR indices; s_magic and s_shift are SGPR
; indices. The macro wraps them as v[...] / s[...].
;   v_quot  - receives the result
;   v_numer - value being divided
;   s_magic - multiplier supplied by the caller
;   s_shift - shift amount (first source of v_lshrrev_b32)
;   v_tmp   - scratch register, overwritten
; v_tmp must not be the same register as v_numer: v_tmp is written by the
; first instruction and v_numer is read again by the second.
; NOTE(review): presumably s_magic/s_shift encode division by a fixed
; denominator (magic-number division); their derivation is not visible
; here - confirm against the host-side code.
.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp
v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer]
v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer]
v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp]
.endm

; .mdiv_u32_rem_vs: per-lane (vector) quotient and remainder.
;   v[v_quot] = result of .mdiv_u32_vs applied to v[v_numer]
;   v[v_rem]  = v[v_numer] - s[s_denom] * v[v_quot]
; v_rem, v_quot, v_numer and v_tmp are VGPR indices; s_magic, s_shift and
; s_denom are SGPR indices. v_tmp is scratch and is overwritten.
; v_quot and v_tmp must differ from v_numer, because v_numer is read again
; by the final subtract after both of them have been written.
; NOTE(review): the remainder is only meaningful when s_magic/s_shift were
; generated for the value held in s_denom - confirm at the call sites.
.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp
.mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp
v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot]
v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp]
.endm

; .v_clear_acc_c: write 0 to num consecutive accumulator registers,
; a[a] .. a[a + num - 1], emitting one v_accvgpr_write_b32 per register.
;   a   - index of the first accumulator register to clear
;   num - number of registers to clear (used as the .rept count)
; The assembler symbol _a is the running index. It is reassigned on every
; expansion and is left equal to a + num afterwards, so any other use of
; _a in the same assembly unit is overwritten.
.macro .v_clear_acc_c a, num
_a = \a
.rept \num
v_accvgpr_write_b32 a[_a], 0
_a = _a + 1
.endr
.endm

; .v_clear_nc: write 0 to num consecutive vector registers,
; v[vid] .. v[vid + num - 1], emitting one v_mov_b32 per register.
;   vid - index of the first VGPR to clear
;   num - number of registers to clear (used as the .rept count)
; The assembler symbol _v is the running index. It is reassigned on every
; expansion and is left equal to vid + num afterwards, so any other use of
; _v in the same assembly unit is overwritten.
.macro .v_clear_nc vid, num
_v = \vid
.rept \num
v_mov_b32 v[_v], 0
_v = _v + 1
.endr
.endm
.include "igemm_fwd_gtcx_nhwc_fp16_utils.inc"

;----------------------------------------------------------
; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x4x1_1x4x1x64_gkgs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,47 +23,9 @@
* SOFTWARE.
*
*******************************************************************************/
; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0)
; generated by igemm_codegen.py (32a41f791dcf0139e95f217f3905939fbbae794c)
;
; .mdiv_u32_ss: scalar quotient from a caller-supplied multiplier and shift.
;   s[s_quot] = (s_mul_hi_u32(s[s_magic], s[s_numer]) + s[s_numer]) >> s[s_shift]
; Every argument is an SGPR index; the macro wraps each one as s[...].
;   s_quot  - receives the result
;   s_numer - value being divided
;   s_magic - multiplier supplied by the caller
;   s_shift - logical right-shift amount supplied by the caller
;   s_tmp   - scratch register, overwritten
; s_tmp must not be the same register as s_numer: s_tmp is written by the
; first instruction and s_numer is read again by the second.
; NOTE(review): presumably s_magic/s_shift encode division by a fixed
; denominator (magic-number division); their derivation is not visible
; here - confirm against the host-side code.
.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp
s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer]
s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer]
s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift]
.endm

; .mdiv_u32_rem_ss: scalar quotient and remainder.
;   s[s_quot] = result of .mdiv_u32_ss applied to s[s_numer]
;   s[s_rem]  = s[s_numer] - s[s_denom] * s[s_quot]
; Every argument is an SGPR index. s_tmp is scratch and is overwritten.
; s_quot and s_tmp must differ from s_numer, because s_numer is read again
; by the final subtract after both of them have been written.
; NOTE(review): the remainder is only meaningful when s_magic/s_shift were
; generated for the value held in s_denom - confirm at the call sites.
.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp
.mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp
s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot]
s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp]
.endm

; .mdiv_u32_vs: per-lane (vector) quotient from a scalar multiplier and shift.
;   v[v_quot] = (v_mul_hi_u32(s[s_magic], v[v_numer]) + v[v_numer]) >> s[s_shift]
; v_quot, v_numer and v_tmp are VGPR indices; s_magic and s_shift are SGPR
; indices. The macro wraps them as v[...] / s[...].
;   v_quot  - receives the result
;   v_numer - value being divided
;   s_magic - multiplier supplied by the caller
;   s_shift - shift amount (first source of v_lshrrev_b32)
;   v_tmp   - scratch register, overwritten
; v_tmp must not be the same register as v_numer: v_tmp is written by the
; first instruction and v_numer is read again by the second.
; NOTE(review): presumably s_magic/s_shift encode division by a fixed
; denominator (magic-number division); their derivation is not visible
; here - confirm against the host-side code.
.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp
v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer]
v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer]
v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp]
.endm

; .mdiv_u32_rem_vs: per-lane (vector) quotient and remainder.
;   v[v_quot] = result of .mdiv_u32_vs applied to v[v_numer]
;   v[v_rem]  = v[v_numer] - s[s_denom] * v[v_quot]
; v_rem, v_quot, v_numer and v_tmp are VGPR indices; s_magic, s_shift and
; s_denom are SGPR indices. v_tmp is scratch and is overwritten.
; v_quot and v_tmp must differ from v_numer, because v_numer is read again
; by the final subtract after both of them have been written.
; NOTE(review): the remainder is only meaningful when s_magic/s_shift were
; generated for the value held in s_denom - confirm at the call sites.
.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp
.mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp
v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot]
v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp]
.endm

; .v_clear_acc_c: write 0 to num consecutive accumulator registers,
; a[a] .. a[a + num - 1], emitting one v_accvgpr_write_b32 per register.
;   a   - index of the first accumulator register to clear
;   num - number of registers to clear (used as the .rept count)
; The assembler symbol _a is the running index. It is reassigned on every
; expansion and is left equal to a + num afterwards, so any other use of
; _a in the same assembly unit is overwritten.
.macro .v_clear_acc_c a, num
_a = \a
.rept \num
v_accvgpr_write_b32 a[_a], 0
_a = _a + 1
.endr
.endm

; .v_clear_nc: write 0 to num consecutive vector registers,
; v[vid] .. v[vid + num - 1], emitting one v_mov_b32 per register.
;   vid - index of the first VGPR to clear
;   num - number of registers to clear (used as the .rept count)
; The assembler symbol _v is the running index. It is reassigned on every
; expansion and is left equal to vid + num afterwards, so any other use of
; _v in the same assembly unit is overwritten.
.macro .v_clear_nc vid, num
_v = \vid
.rept \num
v_mov_b32 v[_v], 0
_v = _v + 1
.endr
.endm
.include "igemm_fwd_gtcx_nhwc_fp16_utils.inc"

;----------------------------------------------------------
; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,47 +23,9 @@
* SOFTWARE.
*
*******************************************************************************/
; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0)
; generated by igemm_codegen.py (32a41f791dcf0139e95f217f3905939fbbae794c)
;
; .mdiv_u32_ss: scalar quotient from a caller-supplied multiplier and shift.
;   s[s_quot] = (s_mul_hi_u32(s[s_magic], s[s_numer]) + s[s_numer]) >> s[s_shift]
; Every argument is an SGPR index; the macro wraps each one as s[...].
;   s_quot  - receives the result
;   s_numer - value being divided
;   s_magic - multiplier supplied by the caller
;   s_shift - logical right-shift amount supplied by the caller
;   s_tmp   - scratch register, overwritten
; s_tmp must not be the same register as s_numer: s_tmp is written by the
; first instruction and s_numer is read again by the second.
; NOTE(review): presumably s_magic/s_shift encode division by a fixed
; denominator (magic-number division); their derivation is not visible
; here - confirm against the host-side code.
.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp
s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer]
s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer]
s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift]
.endm

; .mdiv_u32_rem_ss: scalar quotient and remainder.
;   s[s_quot] = result of .mdiv_u32_ss applied to s[s_numer]
;   s[s_rem]  = s[s_numer] - s[s_denom] * s[s_quot]
; Every argument is an SGPR index. s_tmp is scratch and is overwritten.
; s_quot and s_tmp must differ from s_numer, because s_numer is read again
; by the final subtract after both of them have been written.
; NOTE(review): the remainder is only meaningful when s_magic/s_shift were
; generated for the value held in s_denom - confirm at the call sites.
.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp
.mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp
s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot]
s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp]
.endm

; .mdiv_u32_vs: per-lane (vector) quotient from a scalar multiplier and shift.
;   v[v_quot] = (v_mul_hi_u32(s[s_magic], v[v_numer]) + v[v_numer]) >> s[s_shift]
; v_quot, v_numer and v_tmp are VGPR indices; s_magic and s_shift are SGPR
; indices. The macro wraps them as v[...] / s[...].
;   v_quot  - receives the result
;   v_numer - value being divided
;   s_magic - multiplier supplied by the caller
;   s_shift - shift amount (first source of v_lshrrev_b32)
;   v_tmp   - scratch register, overwritten
; v_tmp must not be the same register as v_numer: v_tmp is written by the
; first instruction and v_numer is read again by the second.
; NOTE(review): presumably s_magic/s_shift encode division by a fixed
; denominator (magic-number division); their derivation is not visible
; here - confirm against the host-side code.
.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp
v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer]
v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer]
v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp]
.endm

; .mdiv_u32_rem_vs: per-lane (vector) quotient and remainder.
;   v[v_quot] = result of .mdiv_u32_vs applied to v[v_numer]
;   v[v_rem]  = v[v_numer] - s[s_denom] * v[v_quot]
; v_rem, v_quot, v_numer and v_tmp are VGPR indices; s_magic, s_shift and
; s_denom are SGPR indices. v_tmp is scratch and is overwritten.
; v_quot and v_tmp must differ from v_numer, because v_numer is read again
; by the final subtract after both of them have been written.
; NOTE(review): the remainder is only meaningful when s_magic/s_shift were
; generated for the value held in s_denom - confirm at the call sites.
.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp
.mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp
v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot]
v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp]
.endm

; .v_clear_acc_c: write 0 to num consecutive accumulator registers,
; a[a] .. a[a + num - 1], emitting one v_accvgpr_write_b32 per register.
;   a   - index of the first accumulator register to clear
;   num - number of registers to clear (used as the .rept count)
; The assembler symbol _a is the running index. It is reassigned on every
; expansion and is left equal to a + num afterwards, so any other use of
; _a in the same assembly unit is overwritten.
.macro .v_clear_acc_c a, num
_a = \a
.rept \num
v_accvgpr_write_b32 a[_a], 0
_a = _a + 1
.endr
.endm

; .v_clear_nc: write 0 to num consecutive vector registers,
; v[vid] .. v[vid + num - 1], emitting one v_mov_b32 per register.
;   vid - index of the first VGPR to clear
;   num - number of registers to clear (used as the .rept count)
; The assembler symbol _v is the running index. It is reassigned on every
; expansion and is left equal to vid + num afterwards, so any other use of
; _v in the same assembly unit is overwritten.
.macro .v_clear_nc vid, num
_v = \vid
.rept \num
v_mov_b32 v[_v], 0
_v = _v + 1
.endr
.endm
.include "igemm_fwd_gtcx_nhwc_fp16_utils.inc"

;----------------------------------------------------------
; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x32x32_wt64x16x4_ws1x1_wr1x1_ta1x4x4x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,47 +23,9 @@
* SOFTWARE.
*
*******************************************************************************/
; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0)
; generated by igemm_codegen.py (32a41f791dcf0139e95f217f3905939fbbae794c)
;
; .mdiv_u32_ss: scalar quotient from a caller-supplied multiplier and shift.
;   s[s_quot] = (s_mul_hi_u32(s[s_magic], s[s_numer]) + s[s_numer]) >> s[s_shift]
; Every argument is an SGPR index; the macro wraps each one as s[...].
;   s_quot  - receives the result
;   s_numer - value being divided
;   s_magic - multiplier supplied by the caller
;   s_shift - logical right-shift amount supplied by the caller
;   s_tmp   - scratch register, overwritten
; s_tmp must not be the same register as s_numer: s_tmp is written by the
; first instruction and s_numer is read again by the second.
; NOTE(review): presumably s_magic/s_shift encode division by a fixed
; denominator (magic-number division); their derivation is not visible
; here - confirm against the host-side code.
.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp
s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer]
s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer]
s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift]
.endm

; .mdiv_u32_rem_ss: scalar quotient and remainder.
;   s[s_quot] = result of .mdiv_u32_ss applied to s[s_numer]
;   s[s_rem]  = s[s_numer] - s[s_denom] * s[s_quot]
; Every argument is an SGPR index. s_tmp is scratch and is overwritten.
; s_quot and s_tmp must differ from s_numer, because s_numer is read again
; by the final subtract after both of them have been written.
; NOTE(review): the remainder is only meaningful when s_magic/s_shift were
; generated for the value held in s_denom - confirm at the call sites.
.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp
.mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp
s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot]
s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp]
.endm

; .mdiv_u32_vs: per-lane (vector) quotient from a scalar multiplier and shift.
;   v[v_quot] = (v_mul_hi_u32(s[s_magic], v[v_numer]) + v[v_numer]) >> s[s_shift]
; v_quot, v_numer and v_tmp are VGPR indices; s_magic and s_shift are SGPR
; indices. The macro wraps them as v[...] / s[...].
;   v_quot  - receives the result
;   v_numer - value being divided
;   s_magic - multiplier supplied by the caller
;   s_shift - shift amount (first source of v_lshrrev_b32)
;   v_tmp   - scratch register, overwritten
; v_tmp must not be the same register as v_numer: v_tmp is written by the
; first instruction and v_numer is read again by the second.
; NOTE(review): presumably s_magic/s_shift encode division by a fixed
; denominator (magic-number division); their derivation is not visible
; here - confirm against the host-side code.
.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp
v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer]
v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer]
v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp]
.endm

; .mdiv_u32_rem_vs: per-lane (vector) quotient and remainder.
;   v[v_quot] = result of .mdiv_u32_vs applied to v[v_numer]
;   v[v_rem]  = v[v_numer] - s[s_denom] * v[v_quot]
; v_rem, v_quot, v_numer and v_tmp are VGPR indices; s_magic, s_shift and
; s_denom are SGPR indices. v_tmp is scratch and is overwritten.
; v_quot and v_tmp must differ from v_numer, because v_numer is read again
; by the final subtract after both of them have been written.
; NOTE(review): the remainder is only meaningful when s_magic/s_shift were
; generated for the value held in s_denom - confirm at the call sites.
.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp
.mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp
v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot]
v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp]
.endm

; .v_clear_acc_c: write 0 to num consecutive accumulator registers,
; a[a] .. a[a + num - 1], emitting one v_accvgpr_write_b32 per register.
;   a   - index of the first accumulator register to clear
;   num - number of registers to clear (used as the .rept count)
; The assembler symbol _a is the running index. It is reassigned on every
; expansion and is left equal to a + num afterwards, so any other use of
; _a in the same assembly unit is overwritten.
.macro .v_clear_acc_c a, num
_a = \a
.rept \num
v_accvgpr_write_b32 a[_a], 0
_a = _a + 1
.endr
.endm

; .v_clear_nc: write 0 to num consecutive vector registers,
; v[vid] .. v[vid + num - 1], emitting one v_mov_b32 per register.
;   vid - index of the first VGPR to clear
;   num - number of registers to clear (used as the .rept count)
; The assembler symbol _v is the running index. It is reassigned on every
; expansion and is left equal to vid + num afterwards, so any other use of
; _v in the same assembly unit is overwritten.
.macro .v_clear_nc vid, num
_v = \vid
.rept \num
v_mov_b32 v[_v], 0
_v = _v + 1
.endr
.endm
.include "igemm_fwd_gtcx_nhwc_fp16_utils.inc"

;----------------------------------------------------------
; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,47 +23,9 @@
* SOFTWARE.
*
*******************************************************************************/
; generated by igemm_codegen.py (0ffceb844f35ce8d878168528ba6941b666ff7c0)
; generated by igemm_codegen.py (32a41f791dcf0139e95f217f3905939fbbae794c)
;
; .mdiv_u32_ss: scalar quotient from a caller-supplied multiplier and shift.
;   s[s_quot] = (s_mul_hi_u32(s[s_magic], s[s_numer]) + s[s_numer]) >> s[s_shift]
; Every argument is an SGPR index; the macro wraps each one as s[...].
;   s_quot  - receives the result
;   s_numer - value being divided
;   s_magic - multiplier supplied by the caller
;   s_shift - logical right-shift amount supplied by the caller
;   s_tmp   - scratch register, overwritten
; s_tmp must not be the same register as s_numer: s_tmp is written by the
; first instruction and s_numer is read again by the second.
; NOTE(review): presumably s_magic/s_shift encode division by a fixed
; denominator (magic-number division); their derivation is not visible
; here - confirm against the host-side code.
.macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp
s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer]
s_add_u32 s[\s_tmp], s[\s_tmp], s[\s_numer]
s_lshr_b32 s[\s_quot], s[\s_tmp], s[\s_shift]
.endm

; .mdiv_u32_rem_ss: scalar quotient and remainder.
;   s[s_quot] = result of .mdiv_u32_ss applied to s[s_numer]
;   s[s_rem]  = s[s_numer] - s[s_denom] * s[s_quot]
; Every argument is an SGPR index. s_tmp is scratch and is overwritten.
; s_quot and s_tmp must differ from s_numer, because s_numer is read again
; by the final subtract after both of them have been written.
; NOTE(review): the remainder is only meaningful when s_magic/s_shift were
; generated for the value held in s_denom - confirm at the call sites.
.macro .mdiv_u32_rem_ss s_rem s_quot s_numer s_magic s_shift s_denom s_tmp
.mdiv_u32_ss \s_quot,\s_numer,\s_magic,\s_shift,\s_tmp
s_mul_i32 s[\s_tmp], s[\s_denom], s[\s_quot]
s_sub_u32 s[\s_rem], s[\s_numer], s[\s_tmp]
.endm

; .mdiv_u32_vs: per-lane (vector) quotient from a scalar multiplier and shift.
;   v[v_quot] = (v_mul_hi_u32(s[s_magic], v[v_numer]) + v[v_numer]) >> s[s_shift]
; v_quot, v_numer and v_tmp are VGPR indices; s_magic and s_shift are SGPR
; indices. The macro wraps them as v[...] / s[...].
;   v_quot  - receives the result
;   v_numer - value being divided
;   s_magic - multiplier supplied by the caller
;   s_shift - shift amount (first source of v_lshrrev_b32)
;   v_tmp   - scratch register, overwritten
; v_tmp must not be the same register as v_numer: v_tmp is written by the
; first instruction and v_numer is read again by the second.
; NOTE(review): presumably s_magic/s_shift encode division by a fixed
; denominator (magic-number division); their derivation is not visible
; here - confirm against the host-side code.
.macro .mdiv_u32_vs v_quot v_numer s_magic s_shift v_tmp
v_mul_hi_u32 v[\v_tmp], s[\s_magic], v[\v_numer]
v_add_u32 v[\v_tmp], v[\v_tmp], v[\v_numer]
v_lshrrev_b32 v[\v_quot], s[\s_shift], v[\v_tmp]
.endm

; .mdiv_u32_rem_vs: per-lane (vector) quotient and remainder.
;   v[v_quot] = result of .mdiv_u32_vs applied to v[v_numer]
;   v[v_rem]  = v[v_numer] - s[s_denom] * v[v_quot]
; v_rem, v_quot, v_numer and v_tmp are VGPR indices; s_magic, s_shift and
; s_denom are SGPR indices. v_tmp is scratch and is overwritten.
; v_quot and v_tmp must differ from v_numer, because v_numer is read again
; by the final subtract after both of them have been written.
; NOTE(review): the remainder is only meaningful when s_magic/s_shift were
; generated for the value held in s_denom - confirm at the call sites.
.macro .mdiv_u32_rem_vs v_rem v_quot v_numer s_magic s_shift s_denom v_tmp
.mdiv_u32_vs \v_quot,\v_numer,\s_magic,\s_shift,\v_tmp
v_mul_lo_u32 v[\v_tmp], s[\s_denom], v[\v_quot]
v_sub_u32 v[\v_rem], v[\v_numer], v[\v_tmp]
.endm

; .v_clear_acc_c: write 0 to num consecutive accumulator registers,
; a[a] .. a[a + num - 1], emitting one v_accvgpr_write_b32 per register.
;   a   - index of the first accumulator register to clear
;   num - number of registers to clear (used as the .rept count)
; The assembler symbol _a is the running index. It is reassigned on every
; expansion and is left equal to a + num afterwards, so any other use of
; _a in the same assembly unit is overwritten.
.macro .v_clear_acc_c a, num
_a = \a
.rept \num
v_accvgpr_write_b32 a[_a], 0
_a = _a + 1
.endr
.endm

; .v_clear_nc: write 0 to num consecutive vector registers,
; v[vid] .. v[vid + num - 1], emitting one v_mov_b32 per register.
;   vid - index of the first VGPR to clear
;   num - number of registers to clear (used as the .rept count)
; The assembler symbol _v is the running index. It is reassigned on every
; expansion and is left equal to vid + num afterwards, so any other use of
; _v in the same assembly unit is overwritten.
.macro .v_clear_nc vid, num
_v = \vid
.rept \num
v_mov_b32 v[_v], 0
_v = _v + 1
.endr
.endm
.include "igemm_fwd_gtcx_nhwc_fp16_utils.inc"

;----------------------------------------------------------
; starting of kernel igemm_fwd_gtcx_nhwc_fp16_bx0_ex0_bt128x64x32_wt32x32x8_ws1x1_wr1x2_ta1x16x1x1_1x2x4x32_tb1x8x1x1_1x4x1x64_pta_gkgs
Expand Down
Loading

0 comments on commit 10a787b

Please sign in to comment.