diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x8x1_1x16x1x16.s index f99820497f..e482ebac89 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x8x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 63 -.set s_k_padded, 64 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 64 +.set s_k_padded, 65 .set s_tmp, 66 .set s_end, 72 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -683,8 +684,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -759,8 +760,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s index 6449e2f636..a1c26be851 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 63 -.set s_k_padded, 64 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 64 +.set s_k_padded, 65 .set s_tmp, 66 .set s_end, 72 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -685,8 +686,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -761,8 +762,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x1x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x1x1_1x16x1x16.s index f8e1ee4c8b..b72f8f5324 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x1x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x1x1_1x16x1x16.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_dim_b, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_dim_b, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 56 -.set s_k_padded, 57 -.set s_tmp, 58 -.set s_end, 64 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 57 +.set s_k_padded, 58 +.set s_tmp, 60 +.set s_end, 66 .set v_c, 0 ; coalescing:8, needed:0, resuable:33 .set v_a, 0 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x1x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -680,8 +681,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -750,8 +751,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -911,7 +912,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 72 - .amdhsa_next_free_sgpr 64 + .amdhsa_next_free_sgpr 66 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -922,7 +923,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x1x1_1x16x1x16 .symbol: igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x1x1_1x16x1x16.kd - .sgpr_count: 70 + .sgpr_count: 72 .vgpr_count: 72 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.s index d8a13f5f24..fb0bf78467 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_dim_b, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_dim_b, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 56 -.set s_k_padded, 57 -.set s_tmp, 58 -.set s_end, 64 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 57 +.set s_k_padded, 58 +.set s_tmp, 60 +.set s_end, 66 .set v_c, 0 ; coalescing:8, needed:0, resuable:33 .set v_a, 0 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x1x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -682,8 +683,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -752,8 +753,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -901,7 +902,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 72 - .amdhsa_next_free_sgpr 64 + .amdhsa_next_free_sgpr 66 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -912,7 +913,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.kd - .sgpr_count: 70 + .sgpr_count: 72 .vgpr_count: 72 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x16x1_1x16x1x16.s index 33fbc2ab4c..2362d57954 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x16x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 65 -.set s_sub_n, 71 -.set s_k_padded, 72 +.set s_in_offset, 52 +.set s_out_offset, 66 +.set s_sub_n, 72 +.set s_k_padded, 73 .set s_tmp, 74 .set s_end, 80 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x1x1x8_1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x16x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -704,8 +705,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x1x1x8_1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -789,8 +790,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x1x1x8 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s index fbc80306f1..f3d6b49c3f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 65 -.set s_sub_n, 71 -.set s_k_padded, 72 +.set s_in_offset, 52 +.set s_out_offset, 66 +.set s_sub_n, 72 +.set s_k_padded, 73 .set s_tmp, 74 .set s_end, 80 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x1x1x8_1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x16x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -706,8 +707,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x1x1x8_1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -791,8 +792,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x1x1x8 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x2x1_1x16x1x16.s index 55613123a1..53982225d9 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x2x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 51 -.set s_sub_n, 57 -.set s_k_padded, 58 +.set s_in_offset, 52 +.set s_out_offset, 52 +.set s_sub_n, 58 +.set s_k_padded, 59 .set s_tmp, 60 .set s_end, 66 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x1x8_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x2x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -668,8 +669,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x1x8_1x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -738,8 +739,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x1x8_ v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s index 2658e0e4d1..0c3b374940 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 51 -.set s_sub_n, 57 -.set s_k_padded, 58 +.set s_in_offset, 52 +.set s_out_offset, 52 +.set s_sub_n, 58 +.set s_k_padded, 59 .set s_tmp, 60 .set s_end, 66 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x1x8_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x2x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -670,8 +671,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x1x8_1x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -740,8 +741,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x1x8_ v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x4x1_1x16x1x16.s index 5744fb3430..d6a4e2f07e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x4x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 53 -.set s_sub_n, 59 -.set s_k_padded, 60 +.set s_in_offset, 52 +.set s_out_offset, 54 +.set s_sub_n, 60 +.set s_k_padded, 61 .set s_tmp, 62 .set s_end, 68 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x1x1x8_1x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x4x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -679,8 +680,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x1x1x8_1x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -752,8 +753,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x1x1x8_1 s_mov_b64 exec, -1 v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x0, step:0x1, num_a_c:4 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s index 27fb291ddc..65fcd925e7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 53 -.set s_sub_n, 59 -.set s_k_padded, 60 +.set s_in_offset, 52 +.set s_out_offset, 54 +.set s_sub_n, 60 +.set s_k_padded, 61 .set s_tmp, 62 .set s_end, 68 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x1x1x8_1x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x4x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -681,8 +682,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x1x1x8_1x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -754,8 +755,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x1x1x8_1 s_mov_b64 exec, -1 v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x0, step:0x1, num_a_c:4 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x128x16_wt8x32x4_ws1x2_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x128x16_wt8x32x4_ws1x2_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s index a3af395c16..f0056ab5bd 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x128x16_wt8x32x4_ws1x2_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x128x16_wt8x32x4_ws1x2_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 57 -.set s_k_padded, 58 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 58 +.set s_k_padded, 59 .set s_tmp, 60 .set s_end, 66 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x128x16_wt8x32x4_ws1x2_wr1x1_ta1x1x1x1_1x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -676,8 +677,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x128x16_wt8x32x4_ws1x2_wr1x1_ta1x1x1x1_1x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -746,8 +747,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x128x16_wt8x32x4_ws1x2_wr1x1_ta1x1x1x1_1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x128x16_wt8x32x4_ws1x2_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x128x16_wt8x32x4_ws1x2_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s index ac45b64192..d5308542b3 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x128x16_wt8x32x4_ws1x2_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x128x16_wt8x32x4_ws1x2_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 57 -.set s_k_padded, 58 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 58 +.set s_k_padded, 59 .set s_tmp, 60 .set s_end, 66 @@ -336,7 +337,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x128x16_wt8x32x4_ws1x2_wr1x1_ta1x1x1x1_1x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -678,8 +679,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x128x16_wt8x32x4_ws1x2_wr1x1_ta1x1x1x1_1x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -748,8 +749,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x128x16_wt8x32x4_ws1x2_wr1x1_ta1x1x1x1_1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s index 290a769ab8..989c8f2954 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 65 -.set s_sub_n, 65 -.set s_k_padded, 66 +.set s_in_offset, 52 +.set s_out_offset, 66 +.set s_sub_n, 66 +.set s_k_padded, 67 .set s_tmp, 68 .set s_end, 74 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x16x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -701,8 +702,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -789,8 +790,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s index b018f8e7cd..16b7d50892 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 65 -.set s_sub_n, 65 -.set s_k_padded, 66 +.set s_in_offset, 52 +.set s_out_offset, 66 +.set s_sub_n, 66 +.set s_k_padded, 67 .set s_tmp, 68 .set s_end, 74 @@ -336,7 +337,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x16x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -703,8 +704,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -791,8 +792,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s index ae301e2f3a..57675b4b77 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 53 -.set s_sub_n, 53 -.set s_k_padded, 54 +.set s_in_offset, 52 +.set s_out_offset, 54 +.set s_sub_n, 54 +.set s_k_padded, 55 .set s_tmp, 56 .set s_end, 62 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x4x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -649,8 +650,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -715,8 +716,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s index 0a11e1b2b3..d49d64d91f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 53 -.set s_sub_n, 53 -.set s_k_padded, 54 +.set s_in_offset, 52 +.set s_out_offset, 54 +.set s_sub_n, 54 +.set s_k_padded, 55 .set s_tmp, 56 .set s_end, 62 @@ -336,7 +337,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x4x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -651,8 +652,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -717,8 +718,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x1x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x1x1_1x16x1x16.s index 648d8d69e7..7fd8252c6f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x1x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x1x1_1x16x1x16.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_dim_b, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_dim_b, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 64 -.set s_k_padded, 65 -.set s_tmp, 66 -.set s_end, 72 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 65 +.set s_k_padded, 66 +.set s_tmp, 68 +.set s_end, 74 .set v_c, 0 ; coalescing:16, needed:0, resuable:45 .set v_a, 0 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x1x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -695,8 +696,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -777,8 +778,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_ ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:128 ; load i_k:1 into local buffer 1, repeat 0 s_waitcnt lgkmcnt(3) v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -1040,7 +1041,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_ .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 84 - .amdhsa_next_free_sgpr 72 + .amdhsa_next_free_sgpr 74 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1051,7 +1052,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x1x1_1x16x1x16 .symbol: igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x1x1_1x16x1x16.kd - .sgpr_count: 78 + .sgpr_count: 80 .vgpr_count: 84 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.s index f16cf4068b..ad88b73359 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_dim_b, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_dim_b, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 64 -.set s_k_padded, 65 -.set s_tmp, 66 -.set s_end, 72 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 65 +.set s_k_padded, 66 +.set s_tmp, 68 +.set s_end, 74 .set v_c, 0 ; coalescing:16, needed:0, resuable:45 .set v_a, 0 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x1x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -697,8 +698,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -779,8 +780,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_ ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:128 ; load i_k:1 into local buffer 1, repeat 0 s_waitcnt lgkmcnt(3) v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -1018,7 +1019,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_ .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 84 - .amdhsa_next_free_sgpr 72 + .amdhsa_next_free_sgpr 74 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1029,7 +1030,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.kd - .sgpr_count: 78 + .sgpr_count: 80 .vgpr_count: 84 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x2x1_1x16x1x16.s index d171cdd629..5f467caa84 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x2x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 51 -.set s_sub_n, 65 -.set s_k_padded, 66 +.set s_in_offset, 52 +.set s_out_offset, 52 +.set s_sub_n, 66 +.set s_k_padded, 67 .set s_tmp, 68 .set s_end, 74 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x1x1x16_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x2x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -692,8 +693,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x1x1x16_1x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -771,8 +772,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x1x1x16_ s_mov_b64 exec, -1 v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x0, step:0x1, num_a_c:4 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s index 0621af0fda..97ef83e572 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 51 -.set s_sub_n, 65 -.set s_k_padded, 66 +.set s_in_offset, 52 +.set s_out_offset, 52 +.set s_sub_n, 66 +.set s_k_padded, 67 .set s_tmp, 68 .set s_end, 74 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x1x1x16_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x2x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -694,8 +695,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x1x1x16_1x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -773,8 +774,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x1x1x16_ s_mov_b64 exec, -1 v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x0, step:0x1, num_a_c:4 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x4x1_1x16x1x16.s index cf3a52b252..fa963e839f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x4x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 53 -.set s_sub_n, 67 -.set s_k_padded, 68 +.set s_in_offset, 52 +.set s_out_offset, 54 +.set s_sub_n, 68 +.set s_k_padded, 69 .set s_tmp, 70 .set s_end, 76 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x16_1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x4x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -692,8 +693,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x16_1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -772,8 +773,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x16 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s index f7aa54b561..d9f29d2eed 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 53 -.set s_sub_n, 67 -.set s_k_padded, 68 +.set s_in_offset, 52 +.set s_out_offset, 54 +.set s_sub_n, 68 +.set s_k_padded, 69 .set s_tmp, 70 .set s_end, 76 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x16_1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x4x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -694,8 +695,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x16_1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -774,8 +775,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x16 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x8x1_1x16x1x16.s index 0c51d5526d..e3ad487e90 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x8x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 57 -.set s_k_padded, 58 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 58 +.set s_k_padded, 59 .set s_tmp, 60 .set s_end, 66 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x1x2_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -665,8 +666,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x1x2_1x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -735,8 +736,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x1x2_ v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s index a8b18fd06a..7802dfb8f6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 57 -.set s_k_padded, 58 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 58 +.set s_k_padded, 59 .set s_tmp, 60 .set s_end, 66 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x1x2_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -667,8 +668,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x1x2_1x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -737,8 +738,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x1x2_ v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x1x2_1x16x1x16_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x1x2_1x16x1x16_tb1x1x16x1_1x16x1x16.s index 2d89cee238..80d15e20ce 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x1x2_1x16x1x16_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x1x2_1x16x1x16_tb1x1x16x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 65 -.set s_sub_n, 65 -.set s_k_padded, 66 +.set s_in_offset, 52 +.set s_out_offset, 66 +.set s_sub_n, 66 +.set s_k_padded, 67 .set s_tmp, 68 .set s_end, 74 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x1x2_1x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x16x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -700,8 +701,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x1x2_1x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -793,8 +794,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x1x2_1 s_waitcnt lgkmcnt(5) v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+8:a_c+11] ; repeat:0x1, step:0x0, num_a_c:4 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x1x2_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x1x2_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s index 3d37f0407a..9463a6248e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x1x2_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x1x2_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 65 -.set s_sub_n, 65 -.set s_k_padded, 66 +.set s_in_offset, 52 +.set s_out_offset, 66 +.set s_sub_n, 66 +.set s_k_padded, 67 .set s_tmp, 68 .set s_end, 74 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x1x2_1x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x16x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -702,8 +703,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x1x2_1x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -795,8 +796,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x1x2_1 s_waitcnt lgkmcnt(5) v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+8:a_c+11] ; repeat:0x1, step:0x0, num_a_c:4 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x2x1_1x16x1x16.s index b53a0bbf18..e303b06695 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x2x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 51 -.set s_sub_n, 51 -.set s_k_padded, 52 +.set s_in_offset, 52 +.set s_out_offset, 52 +.set s_sub_n, 52 +.set s_k_padded, 53 .set s_tmp, 54 .set s_end, 60 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x1x2_1x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x2x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -660,8 +661,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x1x2_1x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -724,8 +725,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x1x2_1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s index 2cbb1530b3..fde9f8efd5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 51 -.set s_sub_n, 51 -.set s_k_padded, 52 +.set s_in_offset, 52 +.set s_out_offset, 52 +.set s_sub_n, 52 +.set s_k_padded, 53 .set s_tmp, 54 .set s_end, 60 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x1x2_1x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x2x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -662,8 +663,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x1x2_1x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -726,8 +727,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x1x2_1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x4x1_1x16x1x16.s index 80940086f9..64f49abc43 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x4x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 53 -.set s_sub_n, 53 -.set s_k_padded, 54 +.set s_in_offset, 52 +.set s_out_offset, 54 +.set s_sub_n, 54 +.set s_k_padded, 55 .set s_tmp, 56 .set s_end, 62 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x1x2_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x4x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -665,8 +666,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x1x2_1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -731,8 +732,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x1x2_1x v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s index 43c11dbb7a..8a8f5fb313 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 53 -.set s_sub_n, 53 -.set s_k_padded, 54 +.set s_in_offset, 52 +.set s_out_offset, 54 +.set s_sub_n, 54 +.set s_k_padded, 55 .set s_tmp, 56 .set s_end, 62 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x1x2_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x4x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -667,8 +668,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x1x2_1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -733,8 +734,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x1x2_1x v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4.s index 98a6a02c4a..d82a633019 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 65 -.set s_sub_n, 65 -.set s_k_padded, 66 +.set s_in_offset, 52 +.set s_out_offset, 66 +.set s_sub_n, 66 +.set s_k_padded, 67 .set s_tmp, 68 .set s_end, 74 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x16x1, cluster(n0,n1b,c0,c1e): 1x16x1x4 v_mov_b32 v[v_tmp], v0 @@ -677,8 +678,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -755,8 +756,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4_gkgs.s index e0da3a8b98..d42547ad9e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 65 -.set s_sub_n, 65 -.set s_k_padded, 66 +.set s_in_offset, 52 +.set s_out_offset, 66 +.set s_sub_n, 66 +.set s_k_padded, 67 .set s_tmp, 68 .set s_end, 74 @@ -336,7 +337,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x16x1, cluster(n0,n1b,c0,c1e): 1x16x1x4 v_mov_b32 v[v_tmp], v0 @@ -679,8 +680,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -757,8 +758,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x8x1_1x16x1x16.s index 25f08384f2..c24904533b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x8x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 59 -.set s_k_padded, 60 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 60 +.set s_k_padded, 61 .set s_tmp, 62 .set s_end, 68 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x1x1x4_1x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -679,8 +680,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x1x1x4_1x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -752,8 +753,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x1x1x4_1 s_mov_b64 exec, -1 v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+4:a_c+7] ; repeat:0x0, step:1x0, num_a_c:4 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s index f686f12736..e63befdd0d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 59 -.set s_k_padded, 60 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 60 +.set s_k_padded, 61 .set s_tmp, 62 .set s_end, 68 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x1x1x4_1x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -681,8 +682,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x1x1x4_1x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -754,8 +755,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x1x1x4_1 s_mov_b64 exec, -1 v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+4:a_c+7] ; repeat:0x0, step:1x0, num_a_c:4 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x1x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x1x1_1x16x1x16.s index 448813e803..9fcfc02ad7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x1x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x1x1_1x16x1x16.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_dim_b, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_dim_b, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 52 -.set s_k_padded, 53 -.set s_tmp, 54 -.set s_end, 60 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 53 +.set s_k_padded, 54 +.set s_tmp, 56 +.set s_end, 62 .set v_c, 0 ; coalescing:4, needed:0, resuable:25 .set v_a, 0 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x1x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -653,8 +654,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -719,8 +720,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -845,7 +846,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 64 - .amdhsa_next_free_sgpr 60 + .amdhsa_next_free_sgpr 62 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -856,7 +857,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x1x1_1x16x1x16 .symbol: igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x1x1_1x16x1x16.kd - .sgpr_count: 66 + .sgpr_count: 68 .vgpr_count: 64 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.s index 76c5f9c305..43a55fe238 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_dim_b, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_dim_b, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 52 -.set s_k_padded, 53 -.set s_tmp, 54 -.set s_end, 60 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 53 +.set s_k_padded, 54 +.set s_tmp, 56 +.set s_end, 62 .set v_c, 0 ; coalescing:4, needed:0, resuable:25 .set v_a, 0 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x1x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -655,8 +656,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -721,8 +722,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -841,7 +842,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 64 - .amdhsa_next_free_sgpr 60 + .amdhsa_next_free_sgpr 62 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -852,7 +853,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.kd - .sgpr_count: 66 + .sgpr_count: 68 .vgpr_count: 64 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x16x1_1x16x1x16.s index f9e52623e0..d1c45fc61c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x16x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 65 -.set s_sub_n, 67 -.set s_k_padded, 68 +.set s_in_offset, 52 +.set s_out_offset, 66 +.set s_sub_n, 68 +.set s_k_padded, 69 .set s_tmp, 70 .set s_end, 76 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x4_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x16x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -702,8 +703,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x4_1x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -784,8 +785,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x4_ v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s index 521d3740e9..330c3d6876 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 65 -.set s_sub_n, 67 -.set s_k_padded, 68 +.set s_in_offset, 52 +.set s_out_offset, 66 +.set s_sub_n, 68 +.set s_k_padded, 69 .set s_tmp, 70 .set s_end, 76 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x4_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x16x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -704,8 +705,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x4_1x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -786,8 +787,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x4_ v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x2x1_1x16x1x16.s index b61821473e..848efc95e6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x2x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 51 -.set s_sub_n, 53 -.set s_k_padded, 54 +.set s_in_offset, 52 +.set s_out_offset, 52 +.set s_sub_n, 54 +.set s_k_padded, 55 .set s_tmp, 56 .set s_end, 62 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x1x1x4_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x2x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -662,8 +663,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x1x1x4_1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -728,8 +729,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x1x1x4_1x v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s index 05f8444506..46ca6f1e7d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 51 -.set s_sub_n, 53 -.set s_k_padded, 54 +.set s_in_offset, 52 +.set s_out_offset, 52 +.set s_sub_n, 54 +.set s_k_padded, 55 .set s_tmp, 56 .set s_end, 62 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x1x1x4_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x2x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -664,8 +665,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x1x1x4_1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -730,8 +731,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x1x1x4_1x v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16x1x4_tb1x1x1x1_1x16x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16x1x4_tb1x1x1x1_1x16x1x4.s index b98a0d6af4..be17c33646 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16x1x4_tb1x1x1x1_1x16x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16x1x4_tb1x1x1x1_1x16x1x4.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_dim_b, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_dim_b, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 64 -.set s_k_padded, 65 -.set s_tmp, 66 -.set s_end, 72 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 65 +.set s_k_padded, 66 +.set s_tmp, 68 +.set s_end, 74 .set v_c, 0 ; coalescing:4, needed:0, resuable:37 .set v_a, 0 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x1x1, cluster(n0,n1b,c0,c1e): 1x16x1x4 v_mov_b32 v[v_tmp], v0 @@ -683,8 +684,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -761,8 +762,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -899,7 +900,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 76 - .amdhsa_next_free_sgpr 72 + .amdhsa_next_free_sgpr 74 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -910,7 +911,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16x1x4_tb1x1x1x1_1x16x1x4 .symbol: igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16x1x4_tb1x1x1x1_1x16x1x4.kd - .sgpr_count: 78 + .sgpr_count: 80 .vgpr_count: 76 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16x1x4_tb1x1x1x1_1x16x1x4_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16x1x4_tb1x1x1x1_1x16x1x4_gkgs.s index 6c91e78ece..714f157817 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16x1x4_tb1x1x1x1_1x16x1x4_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16x1x4_tb1x1x1x1_1x16x1x4_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_dim_b, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_dim_b, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 64 -.set s_k_padded, 65 -.set s_tmp, 66 -.set s_end, 72 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 65 +.set s_k_padded, 66 +.set s_tmp, 68 +.set s_end, 74 .set v_c, 0 ; coalescing:4, needed:0, resuable:37 .set v_a, 0 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x1x1, cluster(n0,n1b,c0,c1e): 1x16x1x4 v_mov_b32 v[v_tmp], v0 @@ -685,8 +686,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -763,8 +764,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -895,7 +896,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 76 - .amdhsa_next_free_sgpr 72 + .amdhsa_next_free_sgpr 74 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -906,7 +907,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16x1x4_tb1x1x1x1_1x16x1x4_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16x1x4_tb1x1x1x1_1x16x1x4_gkgs.kd - .sgpr_count: 78 + .sgpr_count: 80 .vgpr_count: 76 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x4x1_1x16x1x16.s index 15146a093a..4ea1ec3d43 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x4x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 53 -.set s_sub_n, 55 -.set s_k_padded, 56 +.set s_in_offset, 52 +.set s_out_offset, 54 +.set s_sub_n, 56 +.set s_k_padded, 57 .set s_tmp, 58 .set s_end, 64 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x1x1x4_1x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x4x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -662,8 +663,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x1x1x4_1x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -730,8 +731,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x1x1x4_1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s index 124363c94b..27098002c2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 53 -.set s_sub_n, 55 -.set s_k_padded, 56 +.set s_in_offset, 52 +.set s_out_offset, 54 +.set s_sub_n, 56 +.set s_k_padded, 57 .set s_tmp, 58 .set s_end, 64 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x1x1x4_1x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x4x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -664,8 +665,8 @@ igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x1x1x4_1x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -732,8 +733,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx16_ex1_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x1x1x4_1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx1_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x1x1x32_1x32x1x8_tb1x1x1x16_1x32x1x8_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx1_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x1x1x32_1x32x1x8_tb1x1x1x16_1x32x1x8_gkgs.s index 83f36052b9..3bf892bd95 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx1_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x1x1x32_1x32x1x8_tb1x1x1x16_1x32x1x8_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx1_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x1x1x32_1x32x1x8_tb1x1x1x16_1x32x1x8_gkgs.s @@ -239,30 +239,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 63 -.set s_sub_n, 93 -.set s_k_padded, 94 +.set s_in_offset, 50 +.set s_out_offset, 64 +.set s_sub_n, 94 +.set s_k_padded, 95 .set s_tmp, 96 .set s_end, 102 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx1_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x1x1x32_1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x1x16, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x8x1_1x16x1x16.s index a34e551745..4ad5e2c533 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x8x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 71 -.set s_k_padded, 72 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 72 +.set s_k_padded, 73 .set s_tmp, 74 .set s_end, 80 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x1x1x16_1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -707,8 +708,8 @@ igemm_wrw_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x1x1x16_1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -792,8 +793,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x1x1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s index 2887fad211..44f16273df 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 71 -.set s_k_padded, 72 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 72 +.set s_k_padded, 73 .set s_tmp, 74 .set s_end, 80 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x1x1x16_1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -709,8 +710,8 @@ igemm_wrw_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x1x1x16_1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -794,8 +795,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x1x1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x2x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x2x1_1x4x1x64.s index 0f87b15234..3b84c96d2e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x2x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x2x1_1x4x1x64.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 50 -.set s_k_padded, 51 -.set s_tmp, 52 -.set s_end, 58 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 51 +.set s_k_padded, 52 +.set s_tmp, 54 +.set s_end, 60 .set v_c, 0 ; coalescing:32, needed:0, resuable:36 .set v_a, 0 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x2x1, cluster(n0,n1b,c0,c1e): 1x4x1x64 v_mov_b32 v[v_tmp], v0 @@ -1362,7 +1363,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_ .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 70 - .amdhsa_next_free_sgpr 58 + .amdhsa_next_free_sgpr 60 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1373,7 +1374,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x2x1_1x4x1x64 .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x2x1_1x4x1x64.kd - .sgpr_count: 64 + .sgpr_count: 66 .vgpr_count: 70 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s index 4524bc3d3c..9d71c185c5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 50 -.set s_k_padded, 51 -.set s_tmp, 52 -.set s_end, 58 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 51 +.set s_k_padded, 52 +.set s_tmp, 54 +.set s_end, 60 .set v_c, 0 ; coalescing:16, needed:0, resuable:36 .set v_a, 0 @@ -329,7 +330,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x2x1, cluster(n0,n1b,c0,c1e): 1x4x1x64 v_mov_b32 v[v_tmp], v0 @@ -1280,7 +1281,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_ .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 70 - .amdhsa_next_free_sgpr 58 + .amdhsa_next_free_sgpr 60 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1291,7 +1292,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.kd - .sgpr_count: 64 + .sgpr_count: 66 .vgpr_count: 70 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x4x1_1x8x1x32.s index 2cda9d87c1..bc0f89e944 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x4x1_1x8x1x32.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 52 -.set s_sub_n, 54 -.set s_k_padded, 55 -.set s_tmp, 56 -.set s_end, 62 +.set s_in_offset, 51 +.set s_out_offset, 53 +.set s_sub_n, 55 +.set s_k_padded, 56 +.set s_tmp, 58 +.set s_end, 64 .set v_c, 0 ; coalescing:32, needed:0, resuable:44 .set v_a, 0 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x4_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x4x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -1422,7 +1423,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x4_ .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 78 - .amdhsa_next_free_sgpr 62 + .amdhsa_next_free_sgpr 64 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1433,7 +1434,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x4x1_1x8x1x32 .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x4x1_1x8x1x32.kd - .sgpr_count: 68 + .sgpr_count: 70 .vgpr_count: 78 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s index 3c00594d54..6d3af66e10 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 52 -.set s_sub_n, 54 -.set s_k_padded, 55 -.set s_tmp, 56 -.set s_end, 62 +.set s_in_offset, 51 +.set s_out_offset, 53 +.set s_sub_n, 55 +.set s_k_padded, 56 +.set s_tmp, 58 +.set s_end, 64 .set v_c, 0 ; coalescing:16, needed:0, resuable:44 .set v_a, 0 @@ -329,7 +330,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x4_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x4x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -1340,7 +1341,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x4_ .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 78 - .amdhsa_next_free_sgpr 62 + .amdhsa_next_free_sgpr 64 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1351,7 +1352,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.kd - .sgpr_count: 68 + .sgpr_count: 70 .vgpr_count: 78 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x16x32_wt32x8x4_ws2x1_wr1x1_ta1x2x1x8_1x16x1x16_tb1x2x1x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x16x32_wt32x8x4_ws2x1_wr1x1_ta1x2x1x8_1x16x1x16_tb1x2x1x1_1x16x1x16.s index e342f77ed9..1c5d2d5c4c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x16x32_wt32x8x4_ws2x1_wr1x1_ta1x2x1x8_1x16x1x16_tb1x2x1x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x16x32_wt32x8x4_ws2x1_wr1x1_ta1x2x1x8_1x16x1x16_tb1x2x1x1_1x16x1x16.s @@ -238,30 +238,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 55 -.set s_k_padded, 56 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 56 +.set s_k_padded, 57 .set s_tmp, 58 .set s_end, 64 @@ -327,7 +328,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x16x32_wt32x8x4_ws2x1_wr1x1_ta1x2x1x8_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x1x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x16x32_wt32x8x4_ws2x1_wr1x1_ta1x2x1x8_1x16x1x16_tb1x2x1x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x16x32_wt32x8x4_ws2x1_wr1x1_ta1x2x1x8_1x16x1x16_tb1x2x1x1_1x16x1x16_gkgs.s index 6b682e8939..6ec11e646c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x16x32_wt32x8x4_ws2x1_wr1x1_ta1x2x1x8_1x16x1x16_tb1x2x1x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x16x32_wt32x8x4_ws2x1_wr1x1_ta1x2x1x8_1x16x1x16_tb1x2x1x1_1x16x1x16_gkgs.s @@ -239,30 +239,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 55 -.set s_k_padded, 56 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 56 +.set s_k_padded, 57 .set s_tmp, 58 .set s_end, 64 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x16x32_wt32x8x4_ws2x1_wr1x1_ta1x2x1x8_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x1x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x4x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x4x1_1x4x1x64.s index 10cddf3f64..5c87a2bf63 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x4x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x4x1_1x4x1x64.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 52 -.set s_sub_n, 52 -.set s_k_padded, 53 -.set s_tmp, 54 -.set s_end, 60 +.set s_in_offset, 51 +.set s_out_offset, 53 +.set s_sub_n, 53 +.set s_k_padded, 54 +.set s_tmp, 56 +.set s_end, 62 .set v_c, 0 ; coalescing:64, needed:16, resuable:48 .set v_a, 16 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x1x2_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x4x1, cluster(n0,n1b,c0,c1e): 1x4x1x64 v_mov_b32 v[v_tmp], v0 @@ -1978,7 +1979,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x1x2_ .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 128 - .amdhsa_next_free_sgpr 60 + .amdhsa_next_free_sgpr 62 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1989,7 +1990,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x4x1_1x4x1x64 .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x4x1_1x4x1x64.kd - .sgpr_count: 66 + .sgpr_count: 68 .vgpr_count: 128 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s index 59cb246167..6b7ec6b33f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 52 -.set s_sub_n, 52 -.set s_k_padded, 53 -.set s_tmp, 54 -.set s_end, 60 +.set s_in_offset, 51 +.set s_out_offset, 53 +.set s_sub_n, 53 +.set s_k_padded, 54 +.set s_tmp, 56 +.set s_end, 62 .set v_c, 0 ; coalescing:32, needed:0, resuable:48 .set v_a, 0 @@ -329,7 +330,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x1x2_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x4x1, cluster(n0,n1b,c0,c1e): 1x4x1x64 v_mov_b32 v[v_tmp], v0 @@ -1800,7 +1801,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x1x2_ .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 128 - .amdhsa_next_free_sgpr 60 + .amdhsa_next_free_sgpr 62 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1811,7 +1812,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.kd - .sgpr_count: 66 + .sgpr_count: 68 .vgpr_count: 128 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x8x1_1x8x1x32.s index 1d225e36d6..a3b09372b7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x8x1_1x8x1x32.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 56 -.set s_sub_n, 58 -.set s_k_padded, 59 -.set s_tmp, 60 -.set s_end, 66 +.set s_in_offset, 51 +.set s_out_offset, 57 +.set s_sub_n, 59 +.set s_k_padded, 60 +.set s_tmp, 62 +.set s_end, 68 .set v_c, 0 ; coalescing:64, needed:4, resuable:60 .set v_a, 4 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x4x1x4_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x8x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -2069,7 +2070,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x4x1x4_ .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 128 - .amdhsa_next_free_sgpr 66 + .amdhsa_next_free_sgpr 68 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -2080,7 +2081,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x8x1_1x8x1x32 .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x8x1_1x8x1x32.kd - .sgpr_count: 72 + .sgpr_count: 74 .vgpr_count: 128 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.s index 6032250fdf..000d4897f4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 56 -.set s_sub_n, 58 -.set s_k_padded, 59 -.set s_tmp, 60 -.set s_end, 66 +.set s_in_offset, 51 +.set s_out_offset, 57 +.set s_sub_n, 59 +.set s_k_padded, 60 +.set s_tmp, 62 +.set s_end, 68 .set v_c, 0 ; coalescing:32, needed:0, resuable:60 .set v_a, 0 @@ -329,7 +330,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x4x1x4_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x8x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -1891,7 +1892,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x4x1x4_ .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 128 - .amdhsa_next_free_sgpr 66 + .amdhsa_next_free_sgpr 68 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1902,7 +1903,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.kd - .sgpr_count: 72 + .sgpr_count: 74 .vgpr_count: 128 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x2x1x4_1x8x1x32_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x2x1x4_1x8x1x32_tb1x2x1x1_1x8x1x32.s index bb19232575..08164c2c9b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x2x1x4_1x8x1x32_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x2x1x4_1x8x1x32_tb1x2x1x1_1x8x1x32.s @@ -238,30 +238,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 51 -.set s_k_padded, 52 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 52 +.set s_k_padded, 53 .set s_tmp, 54 .set s_end, 60 @@ -327,7 +328,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x2x1x4_1x8 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x1x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x2x1x4_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x2x1x4_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s index ed55d0b86a..eda8309c78 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x2x1x4_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x2x1x4_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s @@ -239,30 +239,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 51 -.set s_k_padded, 52 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 52 +.set s_k_padded, 53 .set s_tmp, 54 .set s_end, 60 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x2x1x4_1x8 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x1x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x32x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x32x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x1x1_1x8x1x32.s index 4453b20549..51f1971642 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x32x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x32x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x1x1_1x8x1x32.s @@ -238,30 +238,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 51 -.set s_k_padded, 52 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 52 +.set s_k_padded, 53 .set s_tmp, 54 .set s_end, 60 @@ -327,7 +328,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x32x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x4_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x1x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x32x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x32x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s index 8d34ffd306..005748a4c9 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x32x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x32x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s @@ -239,30 +239,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 51 -.set s_k_padded, 52 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 52 +.set s_k_padded, 53 .set s_tmp, 54 .set s_end, 60 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x32x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x4_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x1x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x1x1_1x4x1x64.s index 0e79f7783c..4652ebdb81 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x1x1_1x4x1x64.s @@ -238,30 +238,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 49 -.set s_k_padded, 50 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 50 +.set s_k_padded, 51 .set s_tmp, 52 .set s_end, 58 @@ -327,7 +328,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x1x2_1x4x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x1x1, cluster(n0,n1b,c0,c1e): 1x4x1x64 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s index c57ba7fef0..c23d67c221 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x1x2_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s @@ -239,30 +239,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 49 -.set s_k_padded, 50 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 50 +.set s_k_padded, 51 .set s_tmp, 52 .set s_end, 58 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x1x2_1x4x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x1x1, cluster(n0,n1b,c0,c1e): 1x4x1x64 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x2x1_1x8x1x32.s index ce0ae4ea28..b0e4478738 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x2x1_1x8x1x32.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 52 -.set s_k_padded, 53 -.set s_tmp, 54 -.set s_end, 60 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 53 +.set s_k_padded, 54 +.set s_tmp, 56 +.set s_end, 62 .set v_c, 0 ; coalescing:32, needed:0, resuable:48 .set v_a, 0 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x4x1x4_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x2x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -1079,7 +1080,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x4x1x4_ .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 82 - .amdhsa_next_free_sgpr 60 + .amdhsa_next_free_sgpr 62 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1090,7 +1091,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x2x1_1x8x1x32 .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x2x1_1x8x1x32.kd - .sgpr_count: 66 + .sgpr_count: 68 .vgpr_count: 82 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s index adef2cac1a..e633cd6294 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 52 -.set s_k_padded, 53 -.set s_tmp, 54 -.set s_end, 60 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 53 +.set s_k_padded, 54 +.set s_tmp, 56 +.set s_end, 62 .set v_c, 0 ; coalescing:16, needed:0, resuable:48 .set v_a, 0 @@ -329,7 +330,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x4x1x4_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x2x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -1039,7 +1040,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x4x1x4_ .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 82 - .amdhsa_next_free_sgpr 60 + .amdhsa_next_free_sgpr 62 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1050,7 +1051,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x4x1x4_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.kd - .sgpr_count: 66 + .sgpr_count: 68 .vgpr_count: 82 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x2x1x1_1x16x1x16_tb1x2x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x2x1x1_1x16x1x16_tb1x2x8x1_1x16x1x16.s index 4de2ecdbfa..bac3083eed 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x2x1x1_1x16x1x16_tb1x2x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x2x1x1_1x16x1x16_tb1x2x8x1_1x16x1x16.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 56 -.set s_sub_n, 56 -.set s_k_padded, 57 -.set s_tmp, 58 -.set s_end, 64 +.set s_in_offset, 51 +.set s_out_offset, 57 +.set s_sub_n, 57 +.set s_k_padded, 58 +.set s_tmp, 60 +.set s_end, 66 .set v_c, 0 ; coalescing:8, needed:0, resuable:33 .set v_a, 0 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x2x1x1_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x8x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -886,7 +887,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x2x1x1_1x .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 66 - .amdhsa_next_free_sgpr 64 + .amdhsa_next_free_sgpr 66 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -897,7 +898,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x2x1x1_1x16x1x16_tb1x2x8x1_1x16x1x16 .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x2x1x1_1x16x1x16_tb1x2x8x1_1x16x1x16.kd - .sgpr_count: 70 + .sgpr_count: 72 .vgpr_count: 66 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x2x1x1_1x16x1x16_tb1x2x8x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x2x1x1_1x16x1x16_tb1x2x8x1_1x16x1x16_gkgs.s index 6665d8e4a2..07b17b05a2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x2x1x1_1x16x1x16_tb1x2x8x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x2x1x1_1x16x1x16_tb1x2x8x1_1x16x1x16_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 56 -.set s_sub_n, 56 -.set s_k_padded, 57 -.set s_tmp, 58 -.set s_end, 64 +.set s_in_offset, 51 +.set s_out_offset, 57 +.set s_sub_n, 57 +.set s_k_padded, 58 +.set s_tmp, 60 +.set s_end, 66 .set v_c, 0 ; coalescing:8, needed:0, resuable:33 .set v_a, 0 @@ -329,7 +330,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x2x1x1_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x8x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -876,7 +877,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x2x1x1_1x .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 66 - .amdhsa_next_free_sgpr 64 + .amdhsa_next_free_sgpr 66 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -887,7 +888,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x2x1x1_1x16x1x16_tb1x2x8x1_1x16x1x16_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x2x1x1_1x16x1x16_tb1x2x8x1_1x16x1x16_gkgs.kd - .sgpr_count: 70 + .sgpr_count: 72 .vgpr_count: 66 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x2x1x1_1x16x1x16_tb1x2x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x2x1x1_1x16x1x16_tb1x2x16x1_1x16x1x16.s index c75f88ab58..aa05fc3f7c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x2x1x1_1x16x1x16_tb1x2x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x2x1x1_1x16x1x16_tb1x2x16x1_1x16x1x16.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 64 -.set s_sub_n, 64 -.set s_k_padded, 65 -.set s_tmp, 66 -.set s_end, 72 +.set s_in_offset, 51 +.set s_out_offset, 65 +.set s_sub_n, 65 +.set s_k_padded, 66 +.set s_tmp, 68 +.set s_end, 74 .set v_c, 0 ; coalescing:16, needed:0, resuable:45 .set v_a, 0 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x2x1x1_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x16x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -1112,7 +1113,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x2x1x1_1x .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 78 - .amdhsa_next_free_sgpr 72 + .amdhsa_next_free_sgpr 74 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1123,7 +1124,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x2x1x1_1x16x1x16_tb1x2x16x1_1x16x1x16 .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x2x1x1_1x16x1x16_tb1x2x16x1_1x16x1x16.kd - .sgpr_count: 78 + .sgpr_count: 80 .vgpr_count: 78 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x2x1x1_1x16x1x16_tb1x2x16x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x2x1x1_1x16x1x16_tb1x2x16x1_1x16x1x16_gkgs.s index 80d2b05748..02dee72244 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x2x1x1_1x16x1x16_tb1x2x16x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x2x1x1_1x16x1x16_tb1x2x16x1_1x16x1x16_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 64 -.set s_sub_n, 64 -.set s_k_padded, 65 -.set s_tmp, 66 -.set s_end, 72 +.set s_in_offset, 51 +.set s_out_offset, 65 +.set s_sub_n, 65 +.set s_k_padded, 66 +.set s_tmp, 68 +.set s_end, 74 .set v_c, 0 ; coalescing:16, needed:0, resuable:45 .set v_a, 0 @@ -329,7 +330,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x2x1x1_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x16x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -1090,7 +1091,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x2x1x1_1x .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 78 - .amdhsa_next_free_sgpr 72 + .amdhsa_next_free_sgpr 74 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1101,7 +1102,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x2x1x1_1x16x1x16_tb1x2x16x1_1x16x1x16_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x2x1x1_1x16x1x16_tb1x2x16x1_1x16x1x16_gkgs.kd - .sgpr_count: 78 + .sgpr_count: 80 .vgpr_count: 78 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16x1x16_tb1x2x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16x1x16_tb1x2x4x1_1x16x1x16.s index 74be39da92..9ada9013a7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16x1x16_tb1x2x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16x1x16_tb1x2x4x1_1x16x1x16.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 52 -.set s_sub_n, 52 -.set s_k_padded, 53 -.set s_tmp, 54 -.set s_end, 60 +.set s_in_offset, 51 +.set s_out_offset, 53 +.set s_sub_n, 53 +.set s_k_padded, 54 +.set s_tmp, 56 +.set s_end, 62 .set v_c, 0 ; coalescing:4, needed:0, resuable:25 .set v_a, 0 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x4x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -807,7 +808,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x1 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 58 - .amdhsa_next_free_sgpr 60 + .amdhsa_next_free_sgpr 62 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -818,7 +819,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16x1x16_tb1x2x4x1_1x16x1x16 .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16x1x16_tb1x2x4x1_1x16x1x16.kd - .sgpr_count: 66 + .sgpr_count: 68 .vgpr_count: 58 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16x1x16_tb1x2x4x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16x1x16_tb1x2x4x1_1x16x1x16_gkgs.s index 945c633bf2..54eaeba89e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16x1x16_tb1x2x4x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16x1x16_tb1x2x4x1_1x16x1x16_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 52 -.set s_sub_n, 52 -.set s_k_padded, 53 -.set s_tmp, 54 -.set s_end, 60 +.set s_in_offset, 51 +.set s_out_offset, 53 +.set s_sub_n, 53 +.set s_k_padded, 54 +.set s_tmp, 56 +.set s_end, 62 .set v_c, 0 ; coalescing:4, needed:0, resuable:25 .set v_a, 0 @@ -329,7 +330,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x4x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -803,7 +804,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x1 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 58 - .amdhsa_next_free_sgpr 60 + .amdhsa_next_free_sgpr 62 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -814,7 +815,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16x1x16_tb1x2x4x1_1x16x1x16_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16x1x16_tb1x2x4x1_1x16x1x16_gkgs.kd - .sgpr_count: 66 + .sgpr_count: 68 .vgpr_count: 58 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x4x1x4_1x4x1x64_tb1x4x2x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x4x1x4_1x4x1x64_tb1x4x2x1_1x4x1x64.s index 9ab5e3d3ea..630464a70f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x4x1x4_1x4x1x64_tb1x4x2x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x4x1x4_1x4x1x64_tb1x4x2x1_1x4x1x64.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 52 -.set s_k_padded, 53 -.set s_tmp, 54 -.set s_end, 60 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 53 +.set s_k_padded, 54 +.set s_tmp, 56 +.set s_end, 62 .set v_c, 0 ; coalescing:64, needed:16, resuable:48 .set v_a, 16 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x4x1x4_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x2x1, cluster(n0,n1b,c0,c1e): 1x4x1x64 v_mov_b32 v[v_tmp], v0 @@ -1981,7 +1982,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x4x1x4_ .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 128 - .amdhsa_next_free_sgpr 60 + .amdhsa_next_free_sgpr 62 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1992,7 +1993,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x4x1x4_1x4x1x64_tb1x4x2x1_1x4x1x64 .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x4x1x4_1x4x1x64_tb1x4x2x1_1x4x1x64.kd - .sgpr_count: 66 + .sgpr_count: 68 .vgpr_count: 128 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x4x1x4_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x4x1x4_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s index 251ab804ce..e8fd20e7f9 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x4x1x4_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x4x1x4_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 52 -.set s_k_padded, 53 -.set s_tmp, 54 -.set s_end, 60 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 53 +.set s_k_padded, 54 +.set s_tmp, 56 +.set s_end, 62 .set v_c, 0 ; coalescing:32, needed:0, resuable:48 .set v_a, 0 @@ -329,7 +330,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x4x1x4_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x2x1, cluster(n0,n1b,c0,c1e): 1x4x1x64 v_mov_b32 v[v_tmp], v0 @@ -1803,7 +1804,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x4x1x4_ .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 128 - .amdhsa_next_free_sgpr 60 + .amdhsa_next_free_sgpr 62 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1814,7 +1815,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x4x1x4_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x4x1x4_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.kd - .sgpr_count: 66 + .sgpr_count: 68 .vgpr_count: 128 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x4x1_1x8x1x32.s index 00120b6287..200d6ff907 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x4x1_1x8x1x32.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 52 -.set s_sub_n, 58 -.set s_k_padded, 59 -.set s_tmp, 60 -.set s_end, 66 +.set s_in_offset, 51 +.set s_out_offset, 53 +.set s_sub_n, 59 +.set s_k_padded, 60 +.set s_tmp, 62 +.set s_end, 68 .set v_c, 0 ; coalescing:64, needed:4, resuable:60 .set v_a, 4 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x4x1x8_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x4x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -2072,7 +2073,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x4x1x8_ .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 128 - .amdhsa_next_free_sgpr 66 + .amdhsa_next_free_sgpr 68 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -2083,7 +2084,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x4x1_1x8x1x32 .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x4x1_1x8x1x32.kd - .sgpr_count: 72 + .sgpr_count: 74 .vgpr_count: 128 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s index 4f9bc58be2..b9ff7d19c5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 52 -.set s_sub_n, 58 -.set s_k_padded, 59 -.set s_tmp, 60 -.set s_end, 66 +.set s_in_offset, 51 +.set s_out_offset, 53 +.set s_sub_n, 59 +.set s_k_padded, 60 +.set s_tmp, 62 +.set s_end, 68 .set v_c, 0 ; coalescing:32, needed:0, resuable:60 .set v_a, 0 @@ -329,7 +330,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x4x1x8_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x4x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -1894,7 +1895,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x4x1x8_ .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 128 - .amdhsa_next_free_sgpr 66 + .amdhsa_next_free_sgpr 68 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1905,7 +1906,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.kd - .sgpr_count: 72 + .sgpr_count: 74 .vgpr_count: 128 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x16x32_wt64x4x4_ws1x1_wr2x2_ta1x2x1x16_1x16x1x16_tb1x2x1x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x16x32_wt64x4x4_ws1x1_wr2x2_ta1x2x1x16_1x16x1x16_tb1x2x1x1_1x16x1x16.s index 73980289dd..dd1e98f1af 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x16x32_wt64x4x4_ws1x1_wr2x2_ta1x2x1x16_1x16x1x16_tb1x2x1x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x16x32_wt64x4x4_ws1x1_wr2x2_ta1x2x1x16_1x16x1x16_tb1x2x1x1_1x16x1x16.s @@ -238,30 +238,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 63 -.set s_k_padded, 64 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 64 +.set s_k_padded, 65 .set s_tmp, 66 .set s_end, 72 @@ -327,7 +328,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x16x32_wt64x4x4_ws1x1_wr2x2_ta1x2x1x16_1x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x1x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x16x32_wt64x4x4_ws1x1_wr2x2_ta1x2x1x16_1x16x1x16_tb1x2x1x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x16x32_wt64x4x4_ws1x1_wr2x2_ta1x2x1x16_1x16x1x16_tb1x2x1x1_1x16x1x16_gkgs.s index c932dc4915..0fcf5aa598 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x16x32_wt64x4x4_ws1x1_wr2x2_ta1x2x1x16_1x16x1x16_tb1x2x1x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x16x32_wt64x4x4_ws1x1_wr2x2_ta1x2x1x16_1x16x1x16_tb1x2x1x1_1x16x1x16_gkgs.s @@ -239,30 +239,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 63 -.set s_k_padded, 64 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 64 +.set s_k_padded, 65 .set s_tmp, 66 .set s_end, 72 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x16x32_wt64x4x4_ws1x1_wr2x2_ta1x2x1x16_1x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x1x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x2x1x8_1x8x1x32_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x2x1x8_1x8x1x32_tb1x2x1x1_1x8x1x32.s index b7c882ada2..cbdff7a1f9 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x2x1x8_1x8x1x32_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x2x1x8_1x8x1x32_tb1x2x1x1_1x8x1x32.s @@ -238,30 +238,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 55 -.set s_k_padded, 56 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 56 +.set s_k_padded, 57 .set s_tmp, 58 .set s_end, 64 @@ -327,7 +328,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x2x1x8_1x8x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x1x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x2x1x8_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x2x1x8_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s index a0a9c7b196..c82488e9ea 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x2x1x8_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x2x1x8_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s @@ -239,30 +239,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 55 -.set s_k_padded, 56 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 56 +.set s_k_padded, 57 .set s_tmp, 58 .set s_end, 64 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x2x1x8_1x8x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x1x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x32x32_wt64x4x4_ws1x2_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x32x32_wt64x4x4_ws1x2_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x1x1_1x8x1x32.s index 904c5f4f31..c3be4227e3 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x32x32_wt64x4x4_ws1x2_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x32x32_wt64x4x4_ws1x2_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x1x1_1x8x1x32.s @@ -238,30 +238,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 55 -.set s_k_padded, 56 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 56 +.set s_k_padded, 57 .set s_tmp, 58 .set s_end, 64 @@ -327,7 +328,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x32x32_wt64x4x4_ws1x2_wr2x2_ta1x4x1x8_1x8x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x1x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x32x32_wt64x4x4_ws1x2_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x32x32_wt64x4x4_ws1x2_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s index d01a746bae..149f0f76ed 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x32x32_wt64x4x4_ws1x2_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x32x32_wt64x4x4_ws1x2_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s @@ -239,30 +239,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 55 -.set s_k_padded, 56 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 56 +.set s_k_padded, 57 .set s_tmp, 58 .set s_end, 64 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x32x32_wt64x4x4_ws1x2_wr2x2_ta1x4x1x8_1x8x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x1x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x4_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x4_1x4x1x64_tb1x4x1x1_1x4x1x64.s index 5e498fca26..d83173eb7b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x4_1x4x1x64_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x4_1x4x1x64_tb1x4x1x1_1x4x1x64.s @@ -238,30 +238,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 51 -.set s_k_padded, 52 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 52 +.set s_k_padded, 53 .set s_tmp, 54 .set s_end, 60 @@ -327,7 +328,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x4_1x4 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x1x1, cluster(n0,n1b,c0,c1e): 1x4x1x64 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x4_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x4_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s index 0215728069..ef654c9686 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x4_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x4_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s @@ -239,30 +239,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 51 -.set s_k_padded, 52 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 52 +.set s_k_padded, 53 .set s_tmp, 54 .set s_end, 60 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x4_1x4 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x1x1, cluster(n0,n1b,c0,c1e): 1x4x1x64 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x2x1_1x8x1x32.s index 14b64118a2..af7000321e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x2x1_1x8x1x32.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 56 -.set s_k_padded, 57 -.set s_tmp, 58 -.set s_end, 64 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 57 +.set s_k_padded, 58 +.set s_tmp, 60 +.set s_end, 66 .set v_c, 0 ; coalescing:64, needed:16, resuable:48 .set v_a, 16 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x8_1x8 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x2x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -1424,7 +1425,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x8_1 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 98 - .amdhsa_next_free_sgpr 64 + .amdhsa_next_free_sgpr 66 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1435,7 +1436,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x2x1_1x8x1x32 .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x2x1_1x8x1x32.kd - .sgpr_count: 70 + .sgpr_count: 72 .vgpr_count: 98 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s index dd68e4501a..b7c9f93efd 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 56 -.set s_k_padded, 57 -.set s_tmp, 58 -.set s_end, 64 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 57 +.set s_k_padded, 58 +.set s_tmp, 60 +.set s_end, 66 .set v_c, 0 ; coalescing:32, needed:0, resuable:48 .set v_a, 0 @@ -329,7 +330,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x8_1x8 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x2x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -1336,7 +1337,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x8_1 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 82 - .amdhsa_next_free_sgpr 64 + .amdhsa_next_free_sgpr 66 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1347,7 +1348,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.kd - .sgpr_count: 70 + .sgpr_count: 72 .vgpr_count: 82 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x4x1_1x8x1x32.s index d7c10e54bc..84d040b59c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x4x1_1x8x1x32.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 52 -.set s_sub_n, 52 -.set s_k_padded, 53 -.set s_tmp, 54 -.set s_end, 60 +.set s_in_offset, 51 +.set s_out_offset, 53 +.set s_sub_n, 53 +.set s_k_padded, 54 +.set s_tmp, 56 +.set s_end, 62 .set v_c, 0 ; coalescing:16, needed:0, resuable:25 .set v_a, 0 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x2x1x1_1x8 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x4x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -892,7 +893,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x2x1x1_1 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 58 - .amdhsa_next_free_sgpr 60 + .amdhsa_next_free_sgpr 62 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -903,7 +904,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x4x1_1x8x1x32 .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x4x1_1x8x1x32.kd - .sgpr_count: 66 + .sgpr_count: 68 .vgpr_count: 58 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x4x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x4x1_1x8x1x32_gkgs.s index 8690536019..6b4c16f715 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x4x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x4x1_1x8x1x32_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 52 -.set s_sub_n, 52 -.set s_k_padded, 53 -.set s_tmp, 54 -.set s_end, 60 +.set s_in_offset, 51 +.set s_out_offset, 53 +.set s_sub_n, 53 +.set s_k_padded, 54 +.set s_tmp, 56 +.set s_end, 62 .set v_c, 0 ; coalescing:16, needed:0, resuable:25 .set v_a, 0 @@ -329,7 +330,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x2x1x1_1x8 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x4x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -870,7 +871,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x2x1x1_1 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 58 - .amdhsa_next_free_sgpr 60 + .amdhsa_next_free_sgpr 62 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -881,7 +882,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x4x1_1x8x1x32_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x4x1_1x8x1x32_gkgs.kd - .sgpr_count: 66 + .sgpr_count: 68 .vgpr_count: 58 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32.s index 0d294583d5..a1e2daad0f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 52 -.set s_sub_n, 52 -.set s_k_padded, 53 -.set s_tmp, 54 -.set s_end, 60 +.set s_in_offset, 51 +.set s_out_offset, 53 +.set s_sub_n, 53 +.set s_k_padded, 54 +.set s_tmp, 56 +.set s_end, 62 .set v_c, 0 ; coalescing:16, needed:0, resuable:38 .set v_a, 0 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x1_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x4x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -901,7 +902,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x1_ .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 72 - .amdhsa_next_free_sgpr 60 + .amdhsa_next_free_sgpr 62 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -912,7 +913,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32 .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32.kd - .sgpr_count: 66 + .sgpr_count: 68 .vgpr_count: 72 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s index c851835032..1085ce7569 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 52 -.set s_sub_n, 52 -.set s_k_padded, 53 -.set s_tmp, 54 -.set s_end, 60 +.set s_in_offset, 51 +.set s_out_offset, 53 +.set s_sub_n, 53 +.set s_k_padded, 54 +.set s_tmp, 56 +.set s_end, 62 .set v_c, 0 ; coalescing:16, needed:0, resuable:38 .set v_a, 0 @@ -329,7 +330,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x1_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x4x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -879,7 +880,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x1_ .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 72 - .amdhsa_next_free_sgpr 60 + .amdhsa_next_free_sgpr 62 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -890,7 +891,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x1_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.kd - .sgpr_count: 66 + .sgpr_count: 68 .vgpr_count: 72 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x2x1x1_1x8x1x32_tb1x2x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x2x1x1_1x8x1x32_tb1x2x8x1_1x8x1x32.s index 93f89a846c..4438308599 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x2x1x1_1x8x1x32_tb1x2x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x2x1x1_1x8x1x32_tb1x2x8x1_1x8x1x32.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 56 -.set s_sub_n, 56 -.set s_k_padded, 57 -.set s_tmp, 58 -.set s_end, 64 +.set s_in_offset, 51 +.set s_out_offset, 57 +.set s_sub_n, 57 +.set s_k_padded, 58 +.set s_tmp, 60 +.set s_end, 66 .set v_c, 0 ; coalescing:32, needed:0, resuable:45 .set v_a, 0 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x2x1x1_1x8x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x8x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -1144,7 +1145,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x2x1x1_1x .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 78 - .amdhsa_next_free_sgpr 64 + .amdhsa_next_free_sgpr 66 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1155,7 +1156,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x2x1x1_1x8x1x32_tb1x2x8x1_1x8x1x32 .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x2x1x1_1x8x1x32_tb1x2x8x1_1x8x1x32.kd - .sgpr_count: 70 + .sgpr_count: 72 .vgpr_count: 78 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x2x1x1_1x8x1x32_tb1x2x8x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x2x1x1_1x8x1x32_tb1x2x8x1_1x8x1x32_gkgs.s index 08393f8643..c3b12600b5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x2x1x1_1x8x1x32_tb1x2x8x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x2x1x1_1x8x1x32_tb1x2x8x1_1x8x1x32_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 56 -.set s_sub_n, 56 -.set s_k_padded, 57 -.set s_tmp, 58 -.set s_end, 64 +.set s_in_offset, 51 +.set s_out_offset, 57 +.set s_sub_n, 57 +.set s_k_padded, 58 +.set s_tmp, 60 +.set s_end, 66 .set v_c, 0 ; coalescing:16, needed:0, resuable:45 .set v_a, 0 @@ -329,7 +330,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x2x1x1_1x8x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x8x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -1104,7 +1105,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x2x1x1_1x .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 78 - .amdhsa_next_free_sgpr 64 + .amdhsa_next_free_sgpr 66 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1115,7 +1116,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x2x1x1_1x8x1x32_tb1x2x8x1_1x8x1x32_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x2x1x1_1x8x1x32_tb1x2x8x1_1x8x1x32_gkgs.kd - .sgpr_count: 70 + .sgpr_count: 72 .vgpr_count: 78 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32.s index 4dde7da374..07809b8fc0 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 56 -.set s_sub_n, 56 -.set s_k_padded, 57 -.set s_tmp, 58 -.set s_end, 64 +.set s_in_offset, 51 +.set s_out_offset, 57 +.set s_sub_n, 57 +.set s_k_padded, 58 +.set s_tmp, 60 +.set s_end, 66 .set v_c, 0 ; coalescing:32, needed:0, resuable:54 .set v_a, 0 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x4x1x1_1x8x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x8x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -1292,7 +1293,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x4x1x1_1x .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 88 - .amdhsa_next_free_sgpr 64 + .amdhsa_next_free_sgpr 66 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1303,7 +1304,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32 .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32.kd - .sgpr_count: 70 + .sgpr_count: 72 .vgpr_count: 88 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.s index f7d912d62c..aabe7b61e9 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 56 -.set s_sub_n, 56 -.set s_k_padded, 57 -.set s_tmp, 58 -.set s_end, 64 +.set s_in_offset, 51 +.set s_out_offset, 57 +.set s_sub_n, 57 +.set s_k_padded, 58 +.set s_tmp, 60 +.set s_end, 66 .set v_c, 0 ; coalescing:32, needed:0, resuable:54 .set v_a, 0 @@ -329,7 +330,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x4x1x1_1x8x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x8x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -1246,7 +1247,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x4x1x1_1x .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 88 - .amdhsa_next_free_sgpr 64 + .amdhsa_next_free_sgpr 66 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1257,7 +1258,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x4x1x1_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.kd - .sgpr_count: 70 + .sgpr_count: 72 .vgpr_count: 88 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s index d27c1de5bc..32bb58bb87 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s @@ -238,30 +238,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 49 -.set s_k_padded, 50 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 50 +.set s_k_padded, 51 .set s_tmp, 52 .set s_end, 58 @@ -327,7 +328,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x2x1x1_1x8x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x1x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s index 4cbb518944..eee71573f0 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s @@ -239,30 +239,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 49 -.set s_k_padded, 50 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 50 +.set s_k_padded, 51 .set s_tmp, 52 .set s_end, 58 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x2x1x1_1x8x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x1x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x1x1x4_1x32x1x8_tb1x1x4x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x1x1x4_1x32x1x8_tb1x1x4x1_1x32x1x8.s index ab29a2b007..b5a6ac3c27 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x1x1x4_1x32x1x8_tb1x1x4x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x1x1x4_1x32x1x8_tb1x1x4x1_1x32x1x8.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 52 -.set s_sub_n, 54 -.set s_k_padded, 55 -.set s_tmp, 56 -.set s_end, 62 +.set s_in_offset, 51 +.set s_out_offset, 53 +.set s_sub_n, 55 +.set s_k_padded, 56 +.set s_tmp, 58 +.set s_end, 64 .set v_c, 0 ; coalescing:4, needed:0, resuable:28 .set v_a, 0 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x1x1x4_1x3 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x4x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -787,7 +788,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x1x1x4_1 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 62 - .amdhsa_next_free_sgpr 62 + .amdhsa_next_free_sgpr 64 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -798,7 +799,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x1x1x4_1x32x1x8_tb1x1x4x1_1x32x1x8 .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x1x1x4_1x32x1x8_tb1x1x4x1_1x32x1x8.kd - .sgpr_count: 68 + .sgpr_count: 70 .vgpr_count: 62 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s index 6b139c36ec..a3b21e7b01 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x1_1x8x1x32.s @@ -238,30 +238,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 49 -.set s_k_padded, 50 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 50 +.set s_k_padded, 51 .set s_tmp, 52 .set s_end, 58 @@ -327,7 +328,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x4x1x1_1x8 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x1x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s index 7f51c761e0..94891c7178 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s @@ -239,30 +239,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 49 -.set s_k_padded, 50 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 50 +.set s_k_padded, 51 .set s_tmp, 52 .set s_end, 58 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x4x1x1_1x8 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x1x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x2x1_1x8x1x32.s index 06280f0abc..f5b0e8ad26 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x2x1_1x8x1x32.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 50 -.set s_k_padded, 51 -.set s_tmp, 52 -.set s_end, 58 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 51 +.set s_k_padded, 52 +.set s_tmp, 54 +.set s_end, 60 .set v_c, 0 ; coalescing:8, needed:0, resuable:27 .set v_a, 0 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x2x1x1_1x8x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x2x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -831,7 +832,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x2x1x1_1x8 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 60 - .amdhsa_next_free_sgpr 58 + .amdhsa_next_free_sgpr 60 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -842,7 +843,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x2x1_1x8x1x32 .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x2x1_1x8x1x32.kd - .sgpr_count: 64 + .sgpr_count: 66 .vgpr_count: 60 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x2x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x2x1_1x8x1x32_gkgs.s index 56ca1fdc57..a234995868 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x2x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x2x1_1x8x1x32_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 50 -.set s_k_padded, 51 -.set s_tmp, 52 -.set s_end, 58 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 51 +.set s_k_padded, 52 +.set s_tmp, 54 +.set s_end, 60 .set v_c, 0 ; coalescing:8, needed:0, resuable:27 .set v_a, 0 @@ -329,7 +330,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x2x1x1_1x8x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x2x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -821,7 +822,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x2x1x1_1x8 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 60 - .amdhsa_next_free_sgpr 58 + .amdhsa_next_free_sgpr 60 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -832,7 +833,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x2x1_1x8x1x32_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x2x1x1_1x8x1x32_tb1x2x2x1_1x8x1x32_gkgs.kd - .sgpr_count: 64 + .sgpr_count: 66 .vgpr_count: 60 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s index 26c13c8e52..a2476ac3db 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 50 -.set s_k_padded, 51 -.set s_tmp, 52 -.set s_end, 58 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 51 +.set s_k_padded, 52 +.set s_tmp, 54 +.set s_end, 60 .set v_c, 0 ; coalescing:8, needed:0, resuable:30 .set v_a, 0 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x4x1x1_1x8x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x2x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -873,7 +874,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x4x1x1_1x8 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 64 - .amdhsa_next_free_sgpr 58 + .amdhsa_next_free_sgpr 60 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -884,7 +885,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32 .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32.kd - .sgpr_count: 64 + .sgpr_count: 66 .vgpr_count: 64 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s index cb971ea832..73d556c78e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 50 -.set s_k_padded, 51 -.set s_tmp, 52 -.set s_end, 58 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 51 +.set s_k_padded, 52 +.set s_tmp, 54 +.set s_end, 60 .set v_c, 0 ; coalescing:8, needed:0, resuable:30 .set v_a, 0 @@ -329,7 +330,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x4x1x1_1x8x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x2x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -863,7 +864,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x4x1x1_1x8 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 64 - .amdhsa_next_free_sgpr 58 + .amdhsa_next_free_sgpr 60 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -874,7 +875,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x4x1x1_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.kd - .sgpr_count: 64 + .sgpr_count: 66 .vgpr_count: 64 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16x1x4_tb1x2x16x1_1x16x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16x1x4_tb1x2x16x1_1x16x1x4.s index f149d83b25..7dfd3de1c9 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16x1x4_tb1x2x16x1_1x16x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16x1x4_tb1x2x16x1_1x16x1x4.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 64 -.set s_sub_n, 64 -.set s_k_padded, 65 -.set s_tmp, 66 -.set s_end, 72 +.set s_in_offset, 51 +.set s_out_offset, 65 +.set s_sub_n, 65 +.set s_k_padded, 66 +.set s_tmp, 68 +.set s_end, 74 .set v_c, 0 ; coalescing:4, needed:0, resuable:37 .set v_a, 0 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x16x1, cluster(n0,n1b,c0,c1e): 1x16x1x4 v_mov_b32 v[v_tmp], v0 @@ -847,7 +848,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 70 - .amdhsa_next_free_sgpr 72 + .amdhsa_next_free_sgpr 74 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -858,7 +859,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16x1x4_tb1x2x16x1_1x16x1x4 .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16x1x4_tb1x2x16x1_1x16x1x4.kd - .sgpr_count: 78 + .sgpr_count: 80 .vgpr_count: 70 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16x1x4_tb1x2x16x1_1x16x1x4_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16x1x4_tb1x2x16x1_1x16x1x4_gkgs.s index 57b2e90073..debf3dbb59 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16x1x4_tb1x2x16x1_1x16x1x4_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16x1x4_tb1x2x16x1_1x16x1x4_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 64 -.set s_sub_n, 64 -.set s_k_padded, 65 -.set s_tmp, 66 -.set s_end, 72 +.set s_in_offset, 51 +.set s_out_offset, 65 +.set s_sub_n, 65 +.set s_k_padded, 66 +.set s_tmp, 68 +.set s_end, 74 .set v_c, 0 ; coalescing:4, needed:0, resuable:37 .set v_a, 0 @@ -329,7 +330,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x16x1, cluster(n0,n1b,c0,c1e): 1x16x1x4 v_mov_b32 v[v_tmp], v0 @@ -843,7 +844,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 70 - .amdhsa_next_free_sgpr 72 + .amdhsa_next_free_sgpr 74 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -854,7 +855,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16x1x4_tb1x2x16x1_1x16x1x4_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x2x1x1_1x16x1x4_tb1x2x16x1_1x16x1x4_gkgs.kd - .sgpr_count: 78 + .sgpr_count: 80 .vgpr_count: 70 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s index 47b0ec0114..adf88bba38 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 50 -.set s_k_padded, 51 -.set s_tmp, 52 -.set s_end, 58 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 51 +.set s_k_padded, 52 +.set s_tmp, 54 +.set s_end, 60 .set v_c, 0 ; coalescing:32, needed:0, resuable:42 .set v_a, 0 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x4x1x1_1x4x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x2x1, cluster(n0,n1b,c0,c1e): 1x4x1x64 v_mov_b32 v[v_tmp], v0 @@ -1142,7 +1143,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x4x1x1_1x .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 76 - .amdhsa_next_free_sgpr 58 + .amdhsa_next_free_sgpr 60 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1153,7 +1154,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64 .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64.kd - .sgpr_count: 64 + .sgpr_count: 66 .vgpr_count: 76 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s index a946c309d0..e6b4eb05de 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 50 -.set s_k_padded, 51 -.set s_tmp, 52 -.set s_end, 58 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 51 +.set s_k_padded, 52 +.set s_tmp, 54 +.set s_end, 60 .set v_c, 0 ; coalescing:16, needed:0, resuable:42 .set v_a, 0 @@ -329,7 +330,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x4x1x1_1x4x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x2x1, cluster(n0,n1b,c0,c1e): 1x4x1x64 v_mov_b32 v[v_tmp], v0 @@ -1102,7 +1103,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x4x1x1_1x .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 76 - .amdhsa_next_free_sgpr 58 + .amdhsa_next_free_sgpr 60 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1113,7 +1114,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x2x1_1x4x1x64_gkgs.kd - .sgpr_count: 64 + .sgpr_count: 66 .vgpr_count: 76 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x4x1_1x8x1x32.s index 5b3feffcdc..c120220b61 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x4x1_1x8x1x32.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 52 -.set s_sub_n, 52 -.set s_k_padded, 53 -.set s_tmp, 54 -.set s_end, 60 +.set s_in_offset, 51 +.set s_out_offset, 53 +.set s_sub_n, 53 +.set s_k_padded, 54 +.set s_tmp, 56 +.set s_end, 62 .set v_c, 0 ; coalescing:32, needed:0, resuable:48 .set v_a, 0 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x4x1x2_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x4x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -1079,7 +1080,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x4x1x2_ .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 82 - .amdhsa_next_free_sgpr 60 + .amdhsa_next_free_sgpr 62 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1090,7 +1091,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x4x1_1x8x1x32 .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x4x1_1x8x1x32.kd - .sgpr_count: 66 + .sgpr_count: 68 .vgpr_count: 82 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s index 7dd7c7aa77..39263ba236 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 52 -.set s_sub_n, 52 -.set s_k_padded, 53 -.set s_tmp, 54 -.set s_end, 60 +.set s_in_offset, 51 +.set s_out_offset, 53 +.set s_sub_n, 53 +.set s_k_padded, 54 +.set s_tmp, 56 +.set s_end, 62 .set v_c, 0 ; coalescing:16, needed:0, resuable:48 .set v_a, 0 @@ -329,7 +330,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x4x1x2_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x4x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -1039,7 +1040,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x4x1x2_ .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 82 - .amdhsa_next_free_sgpr 60 + .amdhsa_next_free_sgpr 62 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1050,7 +1051,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x4x1_1x8x1x32_gkgs.kd - .sgpr_count: 66 + .sgpr_count: 68 .vgpr_count: 82 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x16x32_wt64x4x4_ws1x1_wr1x1_ta1x2x1x4_1x16x1x16_tb1x2x1x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x16x32_wt64x4x4_ws1x1_wr1x1_ta1x2x1x4_1x16x1x16_tb1x2x1x1_1x16x1x16.s index 9fb15862d9..0c6420270f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x16x32_wt64x4x4_ws1x1_wr1x1_ta1x2x1x4_1x16x1x16_tb1x2x1x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x16x32_wt64x4x4_ws1x1_wr1x1_ta1x2x1x4_1x16x1x16_tb1x2x1x1_1x16x1x16.s @@ -238,30 +238,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 51 -.set s_k_padded, 52 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 52 +.set s_k_padded, 53 .set s_tmp, 54 .set s_end, 60 @@ -327,7 +328,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x16x32_wt64x4x4_ws1x1_wr1x1_ta1x2x1x4_1x16x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x1x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x16x32_wt64x4x4_ws1x1_wr1x1_ta1x2x1x4_1x16x1x16_tb1x2x1x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x16x32_wt64x4x4_ws1x1_wr1x1_ta1x2x1x4_1x16x1x16_tb1x2x1x1_1x16x1x16_gkgs.s index 73a2c4be31..c185ca7a06 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x16x32_wt64x4x4_ws1x1_wr1x1_ta1x2x1x4_1x16x1x16_tb1x2x1x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x16x32_wt64x4x4_ws1x1_wr1x1_ta1x2x1x4_1x16x1x16_tb1x2x1x1_1x16x1x16_gkgs.s @@ -239,30 +239,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 51 -.set s_k_padded, 52 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 52 +.set s_k_padded, 53 .set s_tmp, 54 .set s_end, 60 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x16x32_wt64x4x4_ws1x1_wr1x1_ta1x2x1x4_1x16x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x1x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.s index 6191e8422c..9461471c79 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 52 -.set s_sub_n, 52 -.set s_k_padded, 53 -.set s_tmp, 54 -.set s_end, 60 +.set s_in_offset, 51 +.set s_out_offset, 53 +.set s_sub_n, 53 +.set s_k_padded, 54 +.set s_tmp, 56 +.set s_end, 62 .set v_c, 0 ; coalescing:32, needed:0, resuable:38 .set v_a, 0 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x1_1x4 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x4x1, cluster(n0,n1b,c0,c1e): 1x4x1x64 v_mov_b32 v[v_tmp], v0 @@ -1349,7 +1350,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x1_1 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 72 - .amdhsa_next_free_sgpr 60 + .amdhsa_next_free_sgpr 62 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1360,7 +1361,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64 .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64.kd - .sgpr_count: 66 + .sgpr_count: 68 .vgpr_count: 72 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s index 7647851b95..700c8f497f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 52 -.set s_sub_n, 52 -.set s_k_padded, 53 -.set s_tmp, 54 -.set s_end, 60 +.set s_in_offset, 51 +.set s_out_offset, 53 +.set s_sub_n, 53 +.set s_k_padded, 54 +.set s_tmp, 56 +.set s_end, 62 .set v_c, 0 ; coalescing:16, needed:0, resuable:38 .set v_a, 0 @@ -329,7 +330,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x1_1x4 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x4x1, cluster(n0,n1b,c0,c1e): 1x4x1x64 v_mov_b32 v[v_tmp], v0 @@ -1267,7 +1268,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x1_1 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 72 - .amdhsa_next_free_sgpr 60 + .amdhsa_next_free_sgpr 62 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1278,7 +1279,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x4x1x1_1x4x1x64_tb1x4x4x1_1x4x1x64_gkgs.kd - .sgpr_count: 66 + .sgpr_count: 68 .vgpr_count: 72 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x8x1_1x8x1x32.s index cb360ba40e..9316928b5c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x8x1_1x8x1x32.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 56 -.set s_sub_n, 56 -.set s_k_padded, 57 -.set s_tmp, 58 -.set s_end, 64 +.set s_in_offset, 51 +.set s_out_offset, 57 +.set s_sub_n, 57 +.set s_k_padded, 58 +.set s_tmp, 60 +.set s_end, 66 .set v_c, 0 ; coalescing:64, needed:16, resuable:48 .set v_a, 16 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1x8 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x8x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -1418,7 +1419,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 98 - .amdhsa_next_free_sgpr 64 + .amdhsa_next_free_sgpr 66 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1429,7 +1430,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x8x1_1x8x1x32 .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x8x1_1x8x1x32.kd - .sgpr_count: 70 + .sgpr_count: 72 .vgpr_count: 98 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.s index 166bb8abed..0162d5fd9a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 56 -.set s_sub_n, 56 -.set s_k_padded, 57 -.set s_tmp, 58 -.set s_end, 64 +.set s_in_offset, 51 +.set s_out_offset, 57 +.set s_sub_n, 57 +.set s_k_padded, 58 +.set s_tmp, 60 +.set s_end, 66 .set v_c, 0 ; coalescing:32, needed:0, resuable:48 .set v_a, 0 @@ -329,7 +330,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1x8 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x8x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -1330,7 +1331,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 82 - .amdhsa_next_free_sgpr 64 + .amdhsa_next_free_sgpr 66 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1341,7 +1342,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x8x1_1x8x1x32_gkgs.kd - .sgpr_count: 70 + .sgpr_count: 72 .vgpr_count: 82 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x1x2_1x8x1x32_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x1x2_1x8x1x32_tb1x2x1x1_1x8x1x32.s index 37d00a3969..83fb8ed895 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x1x2_1x8x1x32_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x1x2_1x8x1x32_tb1x2x1x1_1x8x1x32.s @@ -238,30 +238,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 49 -.set s_k_padded, 50 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 50 +.set s_k_padded, 51 .set s_tmp, 52 .set s_end, 58 @@ -327,7 +328,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x1x2_1x8x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x1x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x1x2_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x1x2_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s index d6fafcbe6c..92b9dd0481 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x1x2_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x1x2_1x8x1x32_tb1x2x1x1_1x8x1x32_gkgs.s @@ -239,30 +239,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 49 -.set s_k_padded, 50 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 50 +.set s_k_padded, 51 .set s_tmp, 52 .set s_end, 58 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x1x2_1x8x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x1x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x32_wt32x8x4_ws1x2_wr1x1_ta1x4x1x2_1x8x1x32_tb1x4x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x32_wt32x8x4_ws1x2_wr1x1_ta1x4x1x2_1x8x1x32_tb1x4x1x1_1x8x1x32.s index 4f200a153c..5243e1801b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x32_wt32x8x4_ws1x2_wr1x1_ta1x4x1x2_1x8x1x32_tb1x4x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x32_wt32x8x4_ws1x2_wr1x1_ta1x4x1x2_1x8x1x32_tb1x4x1x1_1x8x1x32.s @@ -238,30 +238,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 49 -.set s_k_padded, 50 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 50 +.set s_k_padded, 51 .set s_tmp, 52 .set s_end, 58 @@ -327,7 +328,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x32_wt32x8x4_ws1x2_wr1x1_ta1x4x1x2_1x8x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x1x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x32_wt32x8x4_ws1x2_wr1x1_ta1x4x1x2_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x32_wt32x8x4_ws1x2_wr1x1_ta1x4x1x2_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s index 554efc1f8a..9ccf5ee18f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x32_wt32x8x4_ws1x2_wr1x1_ta1x4x1x2_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x32_wt32x8x4_ws1x2_wr1x1_ta1x4x1x2_1x8x1x32_tb1x4x1x1_1x8x1x32_gkgs.s @@ -239,30 +239,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 49 -.set s_k_padded, 50 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 50 +.set s_k_padded, 51 .set s_tmp, 52 .set s_end, 58 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x32x32_wt32x8x4_ws1x2_wr1x1_ta1x4x1x2_1x8x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x1x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x4x32_wt64x4x4_ws1x1_wr1x1_ta1x2x1x16_1x16x1x4_tb1x2x1x1_1x16x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x4x32_wt64x4x4_ws1x1_wr1x1_ta1x2x1x16_1x16x1x4_tb1x2x1x1_1x16x1x4.s index a909b70b9b..eed4145442 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x4x32_wt64x4x4_ws1x1_wr1x1_ta1x2x1x16_1x16x1x4_tb1x2x1x1_1x16x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x4x32_wt64x4x4_ws1x1_wr1x1_ta1x2x1x16_1x16x1x4_tb1x2x1x1_1x16x1x4.s @@ -238,30 +238,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 63 -.set s_k_padded, 64 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 64 +.set s_k_padded, 65 .set s_tmp, 66 .set s_end, 72 @@ -327,7 +328,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x4x32_wt64x4x4_ws1x1_wr1x1_ta1x2x1x16_1x16x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x1x1, cluster(n0,n1b,c0,c1e): 1x16x1x4 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x4x32_wt64x4x4_ws1x1_wr1x1_ta1x2x1x16_1x16x1x4_tb1x2x1x1_1x16x1x4_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x4x32_wt64x4x4_ws1x1_wr1x1_ta1x2x1x16_1x16x1x4_tb1x2x1x1_1x16x1x4_gkgs.s index a4950cc04e..f21f5f076d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x4x32_wt64x4x4_ws1x1_wr1x1_ta1x2x1x16_1x16x1x4_tb1x2x1x1_1x16x1x4_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x4x32_wt64x4x4_ws1x1_wr1x1_ta1x2x1x16_1x16x1x4_tb1x2x1x1_1x16x1x4_gkgs.s @@ -239,30 +239,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 63 -.set s_k_padded, 64 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 64 +.set s_k_padded, 65 .set s_tmp, 66 .set s_end, 72 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x4x32_wt64x4x4_ws1x1_wr1x1_ta1x2x1x16_1x16x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x2x1x1, cluster(n0,n1b,c0,c1e): 1x16x1x4 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s index d4f3ba540d..e55c55d00d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s @@ -238,30 +238,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 49 -.set s_k_padded, 50 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 50 +.set s_k_padded, 51 .set s_tmp, 52 .set s_end, 58 @@ -327,7 +328,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x4x1x1_1x4x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x1x1, cluster(n0,n1b,c0,c1e): 1x4x1x64 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s index 0ad439df8d..25892af327 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x4x1x1_1x4x1x64_tb1x4x1x1_1x4x1x64_gkgs.s @@ -239,30 +239,31 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_block_gtc_ik, 44 -.set s_block_gtc_ic0, 45 -.set s_block_gtc_ic1e, 46 -.set s_block_gtc_in, 47 -.set s_block_gtc_ig, 48 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_block_gtc_ik, 45 +.set s_block_gtc_ic0, 46 +.set s_block_gtc_ic1e, 47 +.set s_block_gtc_in, 48 +.set s_block_gtc_ig, 49 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 49 -.set s_out_offset, 49 -.set s_sub_n, 49 -.set s_k_padded, 50 +.set s_in_offset, 50 +.set s_out_offset, 50 +.set s_sub_n, 50 +.set s_k_padded, 51 .set s_tmp, 52 .set s_end, 58 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x4x1x1_1x4x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x1x1, cluster(n0,n1b,c0,c1e): 1x4x1x64 v_mov_b32 v[v_tmp], v0 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x2x1_1x8x1x32.s index 733c8d9bd0..7e8aa623d8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x2x1_1x8x1x32.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 50 -.set s_k_padded, 51 -.set s_tmp, 52 -.set s_end, 58 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 51 +.set s_k_padded, 52 +.set s_tmp, 54 +.set s_end, 60 .set v_c, 0 ; coalescing:16, needed:0, resuable:36 .set v_a, 0 @@ -328,7 +329,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x2_1x8 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x2x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -911,7 +912,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x2_1 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 70 - .amdhsa_next_free_sgpr 58 + .amdhsa_next_free_sgpr 60 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -922,7 +923,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x2x1_1x8x1x32 .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x2x1_1x8x1x32.kd - .sgpr_count: 64 + .sgpr_count: 66 .vgpr_count: 70 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s index 36e3aa0dea..4a00ef409e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 50 -.set s_k_padded, 51 -.set s_tmp, 52 -.set s_end, 58 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 51 +.set s_k_padded, 52 +.set s_tmp, 54 +.set s_end, 60 .set v_c, 0 ; coalescing:16, needed:0, resuable:36 .set v_a, 0 @@ -329,7 +330,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x2_1x8 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x4x2x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -889,7 +890,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x2_1 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 70 - .amdhsa_next_free_sgpr 58 + .amdhsa_next_free_sgpr 60 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -900,7 +901,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x1x2_1x8x1x32_tb1x4x2x1_1x8x1x32_gkgs.kd - .sgpr_count: 64 + .sgpr_count: 66 .vgpr_count: 70 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x8x1_1x16x1x16.s index d247a5fbf3..7f63868d5b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x8x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 63 -.set s_k_padded, 64 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 64 +.set s_k_padded, 65 .set s_tmp, 66 .set s_end, 72 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -683,8 +684,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -759,8 +760,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_ v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s index de6d1bd616..af65dd78dc 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 63 -.set s_k_padded, 64 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 64 +.set s_k_padded, 65 .set s_tmp, 66 .set s_end, 72 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -685,8 +686,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -761,8 +762,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_ v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x1x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x1x1_1x16x1x16.s index c26054c69c..f86c5ebb40 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x1x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x1x1_1x16x1x16.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_dim_b, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_dim_b, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 56 -.set s_k_padded, 57 -.set s_tmp, 58 -.set s_end, 64 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 57 +.set s_k_padded, 58 +.set s_tmp, 60 +.set s_end, 66 .set v_c, 0 ; coalescing:8, needed:0, resuable:33 .set v_a, 0 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x1x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -680,8 +681,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -750,8 +751,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -911,7 +912,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 72 - .amdhsa_next_free_sgpr 64 + .amdhsa_next_free_sgpr 66 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -922,7 +923,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x1x1_1x16x1x16 .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x1x1_1x16x1x16.kd - .sgpr_count: 70 + .sgpr_count: 72 .vgpr_count: 72 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.s index b8307c3e82..09ff0a0cab 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_dim_b, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_dim_b, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 56 -.set s_k_padded, 57 -.set s_tmp, 58 -.set s_end, 64 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 57 +.set s_k_padded, 58 +.set s_tmp, 60 +.set s_end, 66 .set v_c, 0 ; coalescing:8, needed:0, resuable:33 .set v_a, 0 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x1x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -682,8 +683,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -752,8 +753,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -901,7 +902,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 72 - .amdhsa_next_free_sgpr 64 + .amdhsa_next_free_sgpr 66 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -912,7 +913,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt32x8x4_ws2x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.kd - .sgpr_count: 70 + .sgpr_count: 72 .vgpr_count: 72 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x32_wt32x8x4_ws2x1_wr1x1_ta1x1x1x16_1x32x1x8_tb1x1x2x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x32_wt32x8x4_ws2x1_wr1x1_ta1x1x1x16_1x32x1x8_tb1x1x2x1_1x32x1x8.s index a318e5a0a7..93390e5a49 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x32_wt32x8x4_ws2x1_wr1x1_ta1x1x1x16_1x32x1x8_tb1x1x2x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x32_wt32x8x4_ws2x1_wr1x1_ta1x1x1x16_1x32x1x8_tb1x1x2x1_1x32x1x8.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 51 -.set s_sub_n, 65 -.set s_k_padded, 66 +.set s_in_offset, 52 +.set s_out_offset, 52 +.set s_sub_n, 66 +.set s_k_padded, 67 .set s_tmp, 68 .set s_end, 74 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x32_wt32x8x4_ws2x1_wr1x1_ta1x1x1x16_1x3 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x2x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -701,8 +702,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x32_wt32x8x4_ws2x1_wr1x1_ta1x1x1x16_1x3 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -780,8 +781,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x32_wt32x8x4_ws2x1_wr1x1_ta1x1x1x16_1 s_mov_b64 exec, -1 v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+4:a_c+7] ; repeat:0x0, step:1x0, num_a_c:4 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x32_wt32x8x4_ws2x1_wr1x1_ta1x1x1x16_1x32x1x8_tb1x1x2x1_1x32x1x8_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x32_wt32x8x4_ws2x1_wr1x1_ta1x1x1x16_1x32x1x8_tb1x1x2x1_1x32x1x8_gkgs.s index 7d7c34ac77..14e23d51a4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x32_wt32x8x4_ws2x1_wr1x1_ta1x1x1x16_1x32x1x8_tb1x1x2x1_1x32x1x8_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x32_wt32x8x4_ws2x1_wr1x1_ta1x1x1x16_1x32x1x8_tb1x1x2x1_1x32x1x8_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 51 -.set s_sub_n, 65 -.set s_k_padded, 66 +.set s_in_offset, 52 +.set s_out_offset, 52 +.set s_sub_n, 66 +.set s_k_padded, 67 .set s_tmp, 68 .set s_end, 74 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x32_wt32x8x4_ws2x1_wr1x1_ta1x1x1x16_1x3 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x2x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -703,8 +704,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x32_wt32x8x4_ws2x1_wr1x1_ta1x1x1x16_1x3 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -782,8 +783,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x16x32_wt32x8x4_ws2x1_wr1x1_ta1x1x1x16_1 s_mov_b64 exec, -1 v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+4:a_c+7] ; repeat:0x0, step:1x0, num_a_c:4 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x16x1_1x16x1x16.s index 1895f058f3..228b9b0ebb 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x16x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 65 -.set s_sub_n, 71 -.set s_k_padded, 72 +.set s_in_offset, 52 +.set s_out_offset, 66 +.set s_sub_n, 72 +.set s_k_padded, 73 .set s_tmp, 74 .set s_end, 80 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x1x1x8_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x16x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -704,8 +705,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x1x1x8_1x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -789,8 +790,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x1x1x8_ s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s index aa7edb83ce..f65650543f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 65 -.set s_sub_n, 71 -.set s_k_padded, 72 +.set s_in_offset, 52 +.set s_out_offset, 66 +.set s_sub_n, 72 +.set s_k_padded, 73 .set s_tmp, 74 .set s_end, 80 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x1x1x8_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x16x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -706,8 +707,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x1x1x8_1x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -791,8 +792,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x1x1x8_ s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31] ; repeat:0x0, step:0x1, num_a_c:16 - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x2x1_1x16x1x16.s index 7e877be877..c2b6b56156 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x2x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 51 -.set s_sub_n, 57 -.set s_k_padded, 58 +.set s_in_offset, 52 +.set s_out_offset, 52 +.set s_sub_n, 58 +.set s_k_padded, 59 .set s_tmp, 60 .set s_end, 66 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x1x8_1x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x2x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -668,8 +669,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x1x8_1x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -738,8 +739,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x1x8_1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s index 951761bcfb..a04a0c7457 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x1x8_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 51 -.set s_sub_n, 57 -.set s_k_padded, 58 +.set s_in_offset, 52 +.set s_out_offset, 52 +.set s_sub_n, 58 +.set s_k_padded, 59 .set s_tmp, 60 .set s_end, 66 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x1x8_1x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x2x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -670,8 +671,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x1x8_1x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -740,8 +741,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x32x16_wt64x16x4_ws1x1_wr1x1_ta1x1x1x8_1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x32x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x16_1x32x1x8_tb1x1x4x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x32x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x16_1x32x1x8_tb1x1x4x1_1x32x1x8.s index c429c9a314..8cb099f3b4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x32x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x16_1x32x1x8_tb1x1x4x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x32x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x16_1x32x1x8_tb1x1x4x1_1x32x1x8.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 53 -.set s_sub_n, 67 -.set s_k_padded, 68 +.set s_in_offset, 52 +.set s_out_offset, 54 +.set s_sub_n, 68 +.set s_k_padded, 69 .set s_tmp, 70 .set s_end, 76 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x32x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x16_1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x4x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -692,8 +693,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x32x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x16_1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -772,8 +773,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x32x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x16 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x32x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x16_1x32x1x8_tb1x1x4x1_1x32x1x8_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x32x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x16_1x32x1x8_tb1x1x4x1_1x32x1x8_gkgs.s index 42a097cc0f..384cd081d4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x32x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x16_1x32x1x8_tb1x1x4x1_1x32x1x8_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x32x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x16_1x32x1x8_tb1x1x4x1_1x32x1x8_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 53 -.set s_sub_n, 67 -.set s_k_padded, 68 +.set s_in_offset, 52 +.set s_out_offset, 54 +.set s_sub_n, 68 +.set s_k_padded, 69 .set s_tmp, 70 .set s_end, 76 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x32x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x16_1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x4x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -694,8 +695,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x32x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x16_1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -774,8 +775,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x32x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x16 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x4x1_1x16x1x16.s index b08a42c89e..1f1df4fc53 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x4x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 53 -.set s_sub_n, 59 -.set s_k_padded, 60 +.set s_in_offset, 52 +.set s_out_offset, 54 +.set s_sub_n, 60 +.set s_k_padded, 61 .set s_tmp, 62 .set s_end, 68 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x1x1x8_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x4x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -679,8 +680,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x1x1x8_1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -752,8 +753,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x1x1x8_1x s_mov_b64 exec, -1 v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x0, step:0x1, num_a_c:4 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s index 6fb36332a2..d548630dac 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x1x1x8_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 53 -.set s_sub_n, 59 -.set s_k_padded, 60 +.set s_in_offset, 52 +.set s_out_offset, 54 +.set s_sub_n, 60 +.set s_k_padded, 61 .set s_tmp, 62 .set s_end, 68 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x1x1x8_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x4x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -681,8 +682,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x1x1x8_1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -754,8 +755,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x1x1x8_1x s_mov_b64 exec, -1 v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x0, step:0x1, num_a_c:4 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x1x1x16_1x32x1x8_tb1x1x8x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x1x1x16_1x32x1x8_tb1x1x8x1_1x32x1x8.s index deeaabd8c1..feae0f5440 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x1x1x16_1x32x1x8_tb1x1x8x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x1x1x16_1x32x1x8_tb1x1x8x1_1x32x1x8.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 71 -.set s_k_padded, 72 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 72 +.set s_k_padded, 73 .set s_tmp, 74 .set s_end, 80 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x1x1x16_1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -707,8 +708,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x1x1x16_1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -792,8 +793,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x1x1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+4:a_c+7] ; repeat:0x0, step:1x0, num_a_c:4 - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x1x1x16_1x32x1x8_tb1x1x8x1_1x32x1x8_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x1x1x16_1x32x1x8_tb1x1x8x1_1x32x1x8_gkgs.s index 3091935670..e25e3979db 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x1x1x16_1x32x1x8_tb1x1x8x1_1x32x1x8_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x1x1x16_1x32x1x8_tb1x1x8x1_1x32x1x8_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 71 -.set s_k_padded, 72 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 72 +.set s_k_padded, 73 .set s_tmp, 74 .set s_end, 80 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x1x1x16_1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -709,8 +710,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x1x1x16_1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -794,8 +795,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x1x1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] v_mfma_f32_16x16x16f16 a[a_c+4:a_c+7], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+4:a_c+7] ; repeat:0x0, step:1x0, num_a_c:4 - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x128x16_wt8x32x4_ws1x2_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x128x16_wt8x32x4_ws1x2_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s index 5eece171bf..4a9763f895 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x128x16_wt8x32x4_ws1x2_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x128x16_wt8x32x4_ws1x2_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 57 -.set s_k_padded, 58 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 58 +.set s_k_padded, 59 .set s_tmp, 60 .set s_end, 66 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x128x16_wt8x32x4_ws1x2_wr1x1_ta1x1x1x1_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -676,8 +677,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x128x16_wt8x32x4_ws1x2_wr1x1_ta1x1x1x1_1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -746,8 +747,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x128x16_wt8x32x4_ws1x2_wr1x1_ta1x1x1x1_1x v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x128x16_wt8x32x4_ws1x2_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x128x16_wt8x32x4_ws1x2_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s index 88f8dcea1b..0dc119ef8e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x128x16_wt8x32x4_ws1x2_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x128x16_wt8x32x4_ws1x2_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 57 -.set s_k_padded, 58 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 58 +.set s_k_padded, 59 .set s_tmp, 60 .set s_end, 66 @@ -336,7 +337,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x128x16_wt8x32x4_ws1x2_wr1x1_ta1x1x1x1_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -678,8 +679,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x128x16_wt8x32x4_ws1x2_wr1x1_ta1x1x1x1_1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -748,8 +749,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x128x16_wt8x32x4_ws1x2_wr1x1_ta1x1x1x1_1x v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x1x1x2_1x32x1x8_tb1x1x16x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x1x1x2_1x32x1x8_tb1x1x16x1_1x32x1x8.s index bdfc02d1de..b2e6f184f4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x1x1x2_1x32x1x8_tb1x1x16x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x1x1x2_1x32x1x8_tb1x1x16x1_1x32x1x8.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 65 -.set s_sub_n, 65 -.set s_k_padded, 66 +.set s_in_offset, 52 +.set s_out_offset, 66 +.set s_sub_n, 66 +.set s_k_padded, 67 .set s_tmp, 68 .set s_end, 74 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x1x1x2_1x32 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x16x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -712,8 +713,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x1x1x2_1x32 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -801,8 +802,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x1x1x2_1x s_mov_b64 exec, -1 v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x0, step:0x1, num_a_c:4 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x1x1x2_1x32x1x8_tb1x1x16x1_1x32x1x8_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x1x1x2_1x32x1x8_tb1x1x16x1_1x32x1x8_gkgs.s index be8cfe46e2..9ff0397bf6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x1x1x2_1x32x1x8_tb1x1x16x1_1x32x1x8_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x1x1x2_1x32x1x8_tb1x1x16x1_1x32x1x8_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 65 -.set s_sub_n, 65 -.set s_k_padded, 66 +.set s_in_offset, 52 +.set s_out_offset, 66 +.set s_sub_n, 66 +.set s_k_padded, 67 .set s_tmp, 68 .set s_end, 74 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x1x1x2_1x32 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x16x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -714,8 +715,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x1x1x2_1x32 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -803,8 +804,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x128x32_wt8x32x4_ws1x2_wr1x1_ta1x1x1x2_1x s_mov_b64 exec, -1 v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+4:a_c+7] ; repeat:0x0, step:0x1, num_a_c:4 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s index 9163537a3e..af3d170956 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 65 -.set s_sub_n, 65 -.set s_k_padded, 66 +.set s_in_offset, 52 +.set s_out_offset, 66 +.set s_sub_n, 66 +.set s_k_padded, 67 .set s_tmp, 68 .set s_end, 74 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x16x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -701,8 +702,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -789,8 +790,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s index d8728d630e..9e5b29d69a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 65 -.set s_sub_n, 65 -.set s_k_padded, 66 +.set s_in_offset, 52 +.set s_out_offset, 66 +.set s_sub_n, 66 +.set s_k_padded, 67 .set s_tmp, 68 .set s_end, 74 @@ -336,7 +337,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x16x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -703,8 +704,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -791,8 +792,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x1x1x2_1x32x1x8_tb1x1x32x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x1x1x2_1x32x1x8_tb1x1x32x1_1x32x1x8.s index 4ae5a98a00..63887c48cc 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x1x1x2_1x32x1x8_tb1x1x32x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x1x1x2_1x32x1x8_tb1x1x32x1_1x32x1x8.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 81 -.set s_sub_n, 81 -.set s_k_padded, 82 +.set s_in_offset, 52 +.set s_out_offset, 82 +.set s_sub_n, 82 +.set s_k_padded, 83 .set s_tmp, 84 .set s_end, 90 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x1x1x2_1x32 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x32x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -764,8 +765,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x1x1x2_1x32 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -890,8 +891,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x1x1x2_1x v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x1x1x2_1x32x1x8_tb1x1x32x1_1x32x1x8_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x1x1x2_1x32x1x8_tb1x1x32x1_1x32x1x8_gkgs.s index f16433c5be..578a0a9377 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x1x1x2_1x32x1x8_tb1x1x32x1_1x32x1x8_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x1x1x2_1x32x1x8_tb1x1x32x1_1x32x1x8_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 81 -.set s_sub_n, 81 -.set s_k_padded, 82 +.set s_in_offset, 52 +.set s_out_offset, 82 +.set s_sub_n, 82 +.set s_k_padded, 83 .set s_tmp, 84 .set s_end, 90 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x1x1x2_1x32 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x32x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -766,8 +767,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x1x1x2_1x32 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -892,8 +893,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x256x32_wt4x64x4_ws1x1_wr2x2_ta1x1x1x2_1x v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s index 9f9a9bc3d3..97048b7116 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 53 -.set s_sub_n, 53 -.set s_k_padded, 54 +.set s_in_offset, 52 +.set s_out_offset, 54 +.set s_sub_n, 54 +.set s_k_padded, 55 .set s_tmp, 56 .set s_end, 62 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x4x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -649,8 +650,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -715,8 +716,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s index cfee18a871..3576033bab 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 53 -.set s_sub_n, 53 -.set s_k_padded, 54 +.set s_in_offset, 52 +.set s_out_offset, 54 +.set s_sub_n, 54 +.set s_k_padded, 55 .set s_tmp, 56 .set s_end, 62 @@ -336,7 +337,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x4x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -651,8 +652,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -717,8 +718,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x1x1x2_1x32x1x8_tb1x1x8x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x1x1x2_1x32x1x8_tb1x1x8x1_1x32x1x8.s index b232edd6c6..678dbadfbb 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x1x1x2_1x32x1x8_tb1x1x8x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x1x1x2_1x32x1x8_tb1x1x8x1_1x32x1x8.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 57 -.set s_k_padded, 58 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 58 +.set s_k_padded, 59 .set s_tmp, 60 .set s_end, 66 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x1x1x2_1x32x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -659,8 +660,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x1x1x2_1x32x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -729,8 +730,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x1x1x2_1x3 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x1x1x2_1x32x1x8_tb1x1x8x1_1x32x1x8_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x1x1x2_1x32x1x8_tb1x1x8x1_1x32x1x8_gkgs.s index fbd766d542..68b2bf65ee 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x1x1x2_1x32x1x8_tb1x1x8x1_1x32x1x8_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x1x1x2_1x32x1x8_tb1x1x8x1_1x32x1x8_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 57 -.set s_k_padded, 58 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 58 +.set s_k_padded, 59 .set s_tmp, 60 .set s_end, 66 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x1x1x2_1x32x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -661,8 +662,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x1x1x2_1x32x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -731,8 +732,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt16x64x32_wt4x64x4_ws1x1_wr1x1_ta1x1x1x2_1x3 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x8x1_1x16x1x16.s index 1e8b957662..ded7ad56cf 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x8x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 71 -.set s_k_padded, 72 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 72 +.set s_k_padded, 73 .set s_tmp, 74 .set s_end, 80 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x1x1x16_1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -707,8 +708,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x1x1x16_1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -792,8 +793,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x1x1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s index 09cd543322..2fdb7f48d4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 71 -.set s_k_padded, 72 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 72 +.set s_k_padded, 73 .set s_tmp, 74 .set s_end, 80 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x1x1x16_1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -709,8 +710,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x1x1x16_1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -794,8 +795,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x1x1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] v_mfma_f32_32x32x8f16 a[a_c+16:a_c+31], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+16:a_c+31] ; repeat:0x0, step:1x0, num_a_c:16 - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x1x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x1x1_1x16x1x16.s index 9da7cf7dac..65104836e7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x1x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x1x1_1x16x1x16.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_dim_b, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_dim_b, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 64 -.set s_k_padded, 65 -.set s_tmp, 66 -.set s_end, 72 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 65 +.set s_k_padded, 66 +.set s_tmp, 68 +.set s_end, 74 .set v_c, 0 ; coalescing:16, needed:0, resuable:45 .set v_a, 0 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x1x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -695,8 +696,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -777,8 +778,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1 ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:128 ; load i_k:1 into local buffer 1, repeat 0 s_waitcnt lgkmcnt(3) v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -1040,7 +1041,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 84 - .amdhsa_next_free_sgpr 72 + .amdhsa_next_free_sgpr 74 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1051,7 +1052,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x1x1_1x16x1x16 .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x1x1_1x16x1x16.kd - .sgpr_count: 78 + .sgpr_count: 80 .vgpr_count: 84 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.s index 5600272e15..d771972e17 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_dim_b, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_dim_b, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 64 -.set s_k_padded, 65 -.set s_tmp, 66 -.set s_end, 72 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 65 +.set s_k_padded, 66 +.set s_tmp, 68 +.set s_end, 74 .set v_c, 0 ; coalescing:16, needed:0, resuable:45 .set v_a, 0 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x1x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -697,8 +698,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -779,8 +780,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1 ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:128 ; load i_k:1 into local buffer 1, repeat 0 s_waitcnt lgkmcnt(3) v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x1, step:0x0, num_a_c:4 - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -1018,7 +1019,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 84 - .amdhsa_next_free_sgpr 72 + .amdhsa_next_free_sgpr 74 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -1029,7 +1030,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.kd - .sgpr_count: 78 + .sgpr_count: 80 .vgpr_count: 84 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x32_wt64x4x4_ws1x1_wr2x2_ta1x1x1x32_1x32x1x8_tb1x1x2x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x32_wt64x4x4_ws1x1_wr2x2_ta1x1x1x32_1x32x1x8_tb1x1x2x1_1x32x1x8.s index 09e207a2a4..1e68f9ba2b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x32_wt64x4x4_ws1x1_wr2x2_ta1x1x1x32_1x32x1x8_tb1x1x2x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x32_wt64x4x4_ws1x1_wr2x2_ta1x1x1x32_1x32x1x8_tb1x1x2x1_1x32x1x8.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 51 -.set s_sub_n, 81 -.set s_k_padded, 82 +.set s_in_offset, 52 +.set s_out_offset, 52 +.set s_sub_n, 82 +.set s_k_padded, 83 .set s_tmp, 84 .set s_end, 90 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x32_wt64x4x4_ws1x1_wr2x2_ta1x1x1x32_1x3 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x2x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -770,8 +771,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x32_wt64x4x4_ws1x1_wr2x2_ta1x1x1x32_1x3 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -896,8 +897,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x32_wt64x4x4_ws1x1_wr2x2_ta1x1x1x32_1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x32_wt64x4x4_ws1x1_wr2x2_ta1x1x1x32_1x32x1x8_tb1x1x2x1_1x32x1x8_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x32_wt64x4x4_ws1x1_wr2x2_ta1x1x1x32_1x32x1x8_tb1x1x2x1_1x32x1x8_gkgs.s index 559263d643..5cf51c4017 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x32_wt64x4x4_ws1x1_wr2x2_ta1x1x1x32_1x32x1x8_tb1x1x2x1_1x32x1x8_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x32_wt64x4x4_ws1x1_wr2x2_ta1x1x1x32_1x32x1x8_tb1x1x2x1_1x32x1x8_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 51 -.set s_sub_n, 81 -.set s_k_padded, 82 +.set s_in_offset, 52 +.set s_out_offset, 52 +.set s_sub_n, 82 +.set s_k_padded, 83 .set s_tmp, 84 .set s_end, 90 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x32_wt64x4x4_ws1x1_wr2x2_ta1x1x1x32_1x3 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x2x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -772,8 +773,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x32_wt64x4x4_ws1x1_wr2x2_ta1x1x1x32_1x3 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -898,8 +899,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x16x32_wt64x4x4_ws1x1_wr2x2_ta1x1x1x32_1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x2x1_1x16x1x16.s index 0cac3e0833..07cbf15b4e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x2x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 51 -.set s_sub_n, 65 -.set s_k_padded, 66 +.set s_in_offset, 52 +.set s_out_offset, 52 +.set s_sub_n, 66 +.set s_k_padded, 67 .set s_tmp, 68 .set s_end, 74 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x1x1x16_1x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x2x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -692,8 +693,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x1x1x16_1x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -771,8 +772,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x1x1x16_1 s_mov_b64 exec, -1 v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x0, step:0x1, num_a_c:4 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s index ad6a819e3f..8986b49e75 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 51 -.set s_sub_n, 65 -.set s_k_padded, 66 +.set s_in_offset, 52 +.set s_out_offset, 52 +.set s_sub_n, 66 +.set s_k_padded, 67 .set s_tmp, 68 .set s_end, 74 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x1x1x16_1x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x2x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -694,8 +695,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x1x1x16_1x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -773,8 +774,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x1x1x16_1 s_mov_b64 exec, -1 v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x0, step:0x1, num_a_c:4 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x32x32_wt64x4x4_ws1x2_wr2x2_ta1x1x1x32_1x32x1x8_tb1x1x4x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x32x32_wt64x4x4_ws1x2_wr2x2_ta1x1x1x32_1x32x1x8_tb1x1x4x1_1x32x1x8.s index 2c1bdb0ed4..2b58c1f3c5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x32x32_wt64x4x4_ws1x2_wr2x2_ta1x1x1x32_1x32x1x8_tb1x1x4x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x32x32_wt64x4x4_ws1x2_wr2x2_ta1x1x1x32_1x32x1x8_tb1x1x4x1_1x32x1x8.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 53 -.set s_sub_n, 83 -.set s_k_padded, 84 +.set s_in_offset, 52 +.set s_out_offset, 54 +.set s_sub_n, 84 +.set s_k_padded, 85 .set s_tmp, 86 .set s_end, 92 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x32x32_wt64x4x4_ws1x2_wr2x2_ta1x1x1x32_1x3 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x4x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -777,8 +778,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x32x32_wt64x4x4_ws1x2_wr2x2_ta1x1x1x32_1x3 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -911,8 +912,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x32x32_wt64x4x4_ws1x2_wr2x2_ta1x1x1x32_1 v_mfma_f32_4x4x4f16 a[a_c+24:a_c+27], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+24:a_c+27] ; repeat:1x1, step:0x0, num_a_c:4 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] v_mfma_f32_4x4x4f16 a[a_c+28:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+28:a_c+31] ; repeat:1x1, step:0x1, num_a_c:4 - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x32x32_wt64x4x4_ws1x2_wr2x2_ta1x1x1x32_1x32x1x8_tb1x1x4x1_1x32x1x8_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x32x32_wt64x4x4_ws1x2_wr2x2_ta1x1x1x32_1x32x1x8_tb1x1x4x1_1x32x1x8_gkgs.s index 61094a2a5d..6015564174 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x32x32_wt64x4x4_ws1x2_wr2x2_ta1x1x1x32_1x32x1x8_tb1x1x4x1_1x32x1x8_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x32x32_wt64x4x4_ws1x2_wr2x2_ta1x1x1x32_1x32x1x8_tb1x1x4x1_1x32x1x8_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 53 -.set s_sub_n, 83 -.set s_k_padded, 84 +.set s_in_offset, 52 +.set s_out_offset, 54 +.set s_sub_n, 84 +.set s_k_padded, 85 .set s_tmp, 86 .set s_end, 92 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x32x32_wt64x4x4_ws1x2_wr2x2_ta1x1x1x32_1x3 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x4x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -779,8 +780,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x32x32_wt64x4x4_ws1x2_wr2x2_ta1x1x1x32_1x3 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -913,8 +914,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x32x32_wt64x4x4_ws1x2_wr2x2_ta1x1x1x32_1 v_mfma_f32_4x4x4f16 a[a_c+24:a_c+27], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+24:a_c+27] ; repeat:1x1, step:0x0, num_a_c:4 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] v_mfma_f32_4x4x4f16 a[a_c+28:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+28:a_c+31] ; repeat:1x1, step:0x1, num_a_c:4 - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x4x1_1x16x1x16.s index 2127c58b3a..54b21dbeae 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x4x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 53 -.set s_sub_n, 67 -.set s_k_padded, 68 +.set s_in_offset, 52 +.set s_out_offset, 54 +.set s_sub_n, 68 +.set s_k_padded, 69 .set s_tmp, 70 .set s_end, 76 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x16_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x4x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -692,8 +693,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x16_1x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -772,8 +773,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x16_ v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s index 842cf9d3dc..521af0678a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x16_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 53 -.set s_sub_n, 67 -.set s_k_padded, 68 +.set s_in_offset, 52 +.set s_out_offset, 54 +.set s_sub_n, 68 +.set s_k_padded, 69 .set s_tmp, 70 .set s_end, 76 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x16_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x4x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -694,8 +695,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x16_1x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -774,8 +775,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x64x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x16_ v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x1x1x32_1x32x1x8_tb1x1x8x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x1x1x32_1x32x1x8_tb1x1x8x1_1x32x1x8.s index 47f22f8cc9..b2dc3a74a7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x1x1x32_1x32x1x8_tb1x1x8x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x1x1x32_1x32x1x8_tb1x1x8x1_1x32x1x8.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 87 -.set s_k_padded, 88 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 88 +.set s_k_padded, 89 .set s_tmp, 90 .set s_end, 96 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x1x1x32_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -788,8 +789,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x1x1x32_1x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -898,8 +899,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x1x1x32_ v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x1x1x32_1x32x1x8_tb1x1x8x1_1x32x1x8_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x1x1x32_1x32x1x8_tb1x1x8x1_1x32x1x8_gkgs.s index 5b32f7a39b..8b96bcd9b6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x1x1x32_1x32x1x8_tb1x1x8x1_1x32x1x8_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x1x1x32_1x32x1x8_tb1x1x8x1_1x32x1x8_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 87 -.set s_k_padded, 88 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 88 +.set s_k_padded, 89 .set s_tmp, 90 .set s_end, 96 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x1x1x32_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -790,8 +791,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x1x1x32_1x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -900,8 +901,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt256x64x32_wt32x32x8_ws1x1_wr2x2_ta1x1x1x32_ v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x8x1_1x16x1x16.s index 91134be4b6..d0bb346e85 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x8x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 57 -.set s_k_padded, 58 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 58 +.set s_k_padded, 59 .set s_tmp, 60 .set s_end, 66 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x1x2_1x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -665,8 +666,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x1x2_1x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -735,8 +736,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x1x2_1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s index 7633870fad..271c34ede2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 57 -.set s_k_padded, 58 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 58 +.set s_k_padded, 59 .set s_tmp, 60 .set s_end, 66 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x1x2_1x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -667,8 +668,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x1x2_1x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -737,8 +738,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x1x2_1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x4_1x32x1x8_tb1x1x16x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x4_1x32x1x8_tb1x1x16x1_1x32x1x8.s index ff087d22a0..06c5974670 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x4_1x32x1x8_tb1x1x16x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x4_1x32x1x8_tb1x1x16x1_1x32x1x8.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 65 -.set s_sub_n, 67 -.set s_k_padded, 68 +.set s_in_offset, 52 +.set s_out_offset, 66 +.set s_sub_n, 68 +.set s_k_padded, 69 .set s_tmp, 70 .set s_end, 76 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x4_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x16x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -705,8 +706,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x4_1x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -787,8 +788,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x4_ v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x4_1x32x1x8_tb1x1x16x1_1x32x1x8_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x4_1x32x1x8_tb1x1x16x1_1x32x1x8_gkgs.s index 869ef3db8e..6c7cee095d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x4_1x32x1x8_tb1x1x16x1_1x32x1x8_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x4_1x32x1x8_tb1x1x16x1_1x32x1x8_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 65 -.set s_sub_n, 67 -.set s_k_padded, 68 +.set s_in_offset, 52 +.set s_out_offset, 66 +.set s_sub_n, 68 +.set s_k_padded, 69 .set s_tmp, 70 .set s_end, 76 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x4_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x16x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -707,8 +708,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x4_1x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -789,8 +790,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x128x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x4_ v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x1x2_1x16x1x16_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x1x2_1x16x1x16_tb1x1x16x1_1x16x1x16.s index 64ee2b1890..e774a44013 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x1x2_1x16x1x16_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x1x2_1x16x1x16_tb1x1x16x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 65 -.set s_sub_n, 65 -.set s_k_padded, 66 +.set s_in_offset, 52 +.set s_out_offset, 66 +.set s_sub_n, 66 +.set s_k_padded, 67 .set s_tmp, 68 .set s_end, 74 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x1x2_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x16x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -700,8 +701,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x1x2_1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -793,8 +794,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x1x2_1x s_waitcnt lgkmcnt(5) v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+8:a_c+11] ; repeat:0x1, step:0x0, num_a_c:4 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x1x2_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x1x2_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s index 40f0cb749a..0c3b43b77b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x1x2_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x1x2_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 65 -.set s_sub_n, 65 -.set s_k_padded, 66 +.set s_in_offset, 52 +.set s_out_offset, 66 +.set s_sub_n, 66 +.set s_k_padded, 67 .set s_tmp, 68 .set s_end, 74 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x1x2_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x16x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -702,8 +703,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x1x2_1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -795,8 +796,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x1x2_1x s_waitcnt lgkmcnt(5) v_mfma_f32_4x4x4f16 a[a_c+8:a_c+11], v[v_a+8:v_a+9], v[v_b+6:v_b+7], a[a_c+8:a_c+11] ; repeat:0x1, step:0x0, num_a_c:4 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x1x1x4_1x32x1x8_tb1x1x32x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x1x1x4_1x32x1x8_tb1x1x32x1_1x32x1x8.s index ab70117b3c..a4c9fa4f14 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x1x1x4_1x32x1x8_tb1x1x32x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x1x1x4_1x32x1x8_tb1x1x32x1_1x32x1x8.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 81 -.set s_sub_n, 83 -.set s_k_padded, 84 +.set s_in_offset, 52 +.set s_out_offset, 82 +.set s_sub_n, 84 +.set s_k_padded, 85 .set s_tmp, 86 .set s_end, 92 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x1x1x4_1x32 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x32x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -771,8 +772,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x1x1x4_1x32 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -905,8 +906,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x1x1x4_1x v_mfma_f32_4x4x4f16 a[a_c+24:a_c+27], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+24:a_c+27] ; repeat:1x1, step:0x0, num_a_c:4 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] v_mfma_f32_4x4x4f16 a[a_c+28:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+28:a_c+31] ; repeat:1x1, step:1x0, num_a_c:4 - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x1x1x4_1x32x1x8_tb1x1x32x1_1x32x1x8_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x1x1x4_1x32x1x8_tb1x1x32x1_1x32x1x8_gkgs.s index 989a678fd8..a5ddec742d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x1x1x4_1x32x1x8_tb1x1x32x1_1x32x1x8_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x1x1x4_1x32x1x8_tb1x1x32x1_1x32x1x8_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 81 -.set s_sub_n, 83 -.set s_k_padded, 84 +.set s_in_offset, 52 +.set s_out_offset, 82 +.set s_sub_n, 84 +.set s_k_padded, 85 .set s_tmp, 86 .set s_end, 92 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x1x1x4_1x32 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x32x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -773,8 +774,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x1x1x4_1x32 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -907,8 +908,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x256x32_wt4x64x4_ws2x1_wr2x2_ta1x1x1x4_1x v_mfma_f32_4x4x4f16 a[a_c+24:a_c+27], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+24:a_c+27] ; repeat:1x1, step:0x0, num_a_c:4 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] v_mfma_f32_4x4x4f16 a[a_c+28:a_c+31], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+28:a_c+31] ; repeat:1x1, step:1x0, num_a_c:4 - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x2x1_1x16x1x16.s index 049830680e..d24c99dfaf 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x2x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 51 -.set s_sub_n, 51 -.set s_k_padded, 52 +.set s_in_offset, 52 +.set s_out_offset, 52 +.set s_sub_n, 52 +.set s_k_padded, 53 .set s_tmp, 54 .set s_end, 60 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x1x2_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x2x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -660,8 +661,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x1x2_1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -724,8 +725,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x1x2_1x v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s index 7342098676..c7cdfb7c51 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 51 -.set s_sub_n, 51 -.set s_k_padded, 52 +.set s_in_offset, 52 +.set s_out_offset, 52 +.set s_sub_n, 52 +.set s_k_padded, 53 .set s_tmp, 54 .set s_end, 60 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x1x2_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x2x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -662,8 +663,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x1x2_1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -726,8 +727,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x1x2_1x v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x1x1x4_1x32x1x8_tb1x1x4x1_1x32x1x8_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x1x1x4_1x32x1x8_tb1x1x4x1_1x32x1x8_gkgs.s index ad05e9c06f..fc8acdbd3d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x1x1x4_1x32x1x8_tb1x1x4x1_1x32x1x8_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x1x1x4_1x32x1x8_tb1x1x4x1_1x32x1x8_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 53 -.set s_sub_n, 55 -.set s_k_padded, 56 +.set s_in_offset, 52 +.set s_out_offset, 54 +.set s_sub_n, 56 +.set s_k_padded, 57 .set s_tmp, 58 .set s_end, 64 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x1x1x4_1x3 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x4x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -664,8 +665,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x1x1x4_1x3 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -732,8 +733,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x1x1x4_1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x4x1_1x16x1x16.s index d7665054cf..f2d9809ef0 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x4x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 53 -.set s_sub_n, 53 -.set s_k_padded, 54 +.set s_in_offset, 52 +.set s_out_offset, 54 +.set s_sub_n, 54 +.set s_k_padded, 55 .set s_tmp, 56 .set s_end, 62 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x1x2_1x16x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x4x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -665,8 +666,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x1x2_1x16x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -731,8 +732,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x1x2_1x1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s index 7358c3b791..6159ad6844 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x1x2_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 53 -.set s_sub_n, 53 -.set s_k_padded, 54 +.set s_in_offset, 52 +.set s_out_offset, 54 +.set s_sub_n, 54 +.set s_k_padded, 55 .set s_tmp, 56 .set s_end, 62 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x1x2_1x16x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x4x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -667,8 +668,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x1x2_1x16x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -733,8 +734,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x1x2_1x1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x1x1x4_1x32x1x8_tb1x1x8x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x1x1x4_1x32x1x8_tb1x1x8x1_1x32x1x8.s index eb88321665..420e06e581 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x1x1x4_1x32x1x8_tb1x1x8x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x1x1x4_1x32x1x8_tb1x1x8x1_1x32x1x8.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 59 -.set s_k_padded, 60 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 60 +.set s_k_padded, 61 .set s_tmp, 62 .set s_end, 68 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x1x1x4_1x32x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -682,8 +683,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x1x1x4_1x32x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -755,8 +756,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x1x1x4_1x3 s_mov_b64 exec, -1 v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+4:a_c+7] ; repeat:0x0, step:1x0, num_a_c:4 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x1x1x4_1x32x1x8_tb1x1x8x1_1x32x1x8_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x1x1x4_1x32x1x8_tb1x1x8x1_1x32x1x8_gkgs.s index e24e74dd6f..ba746e7e10 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x1x1x4_1x32x1x8_tb1x1x8x1_1x32x1x8_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x1x1x4_1x32x1x8_tb1x1x8x1_1x32x1x8_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 59 -.set s_k_padded, 60 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 60 +.set s_k_padded, 61 .set s_tmp, 62 .set s_end, 68 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x1x1x4_1x32x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -684,8 +685,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x1x1x4_1x32x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -757,8 +758,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt32x64x32_wt8x32x4_ws2x1_wr1x1_ta1x1x1x4_1x3 s_mov_b64 exec, -1 v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+4:a_c+7] ; repeat:0x0, step:1x0, num_a_c:4 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4.s index 9c6661a916..94cb9bf39e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 65 -.set s_sub_n, 65 -.set s_k_padded, 66 +.set s_in_offset, 52 +.set s_out_offset, 66 +.set s_sub_n, 66 +.set s_k_padded, 67 .set s_tmp, 68 .set s_end, 74 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x16x1, cluster(n0,n1b,c0,c1e): 1x16x1x4 v_mov_b32 v[v_tmp], v0 @@ -677,8 +678,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -755,8 +756,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4_gkgs.s index aba935fdf8..931cecf128 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 65 -.set s_sub_n, 65 -.set s_k_padded, 66 +.set s_in_offset, 52 +.set s_out_offset, 66 +.set s_sub_n, 66 +.set s_k_padded, 67 .set s_tmp, 68 .set s_end, 74 @@ -336,7 +337,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x16x1, cluster(n0,n1b,c0,c1e): 1x16x1x4 v_mov_b32 v[v_tmp], v0 @@ -679,8 +680,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -757,8 +758,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x1x1x2_1x32x1x2_tb1x1x32x1_1x32x1x2.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x1x1x2_1x32x1x2_tb1x1x32x1_1x32x1x2.s index 747119d756..9ca4649aec 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x1x1x2_1x32x1x2_tb1x1x32x1_1x32x1x2.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x1x1x2_1x32x1x2_tb1x1x32x1_1x32x1x2.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 81 -.set s_sub_n, 81 -.set s_k_padded, 82 +.set s_in_offset, 52 +.set s_out_offset, 82 +.set s_sub_n, 82 +.set s_k_padded, 83 .set s_tmp, 84 .set s_end, 90 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x1x1x2_1x32x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x32x1, cluster(n0,n1b,c0,c1e): 1x32x1x2 v_mov_b32 v[v_tmp], v0 @@ -723,8 +724,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x1x1x2_1x32x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -817,8 +818,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x1x1x2_1x32 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x1x1x2_1x32x1x2_tb1x1x32x1_1x32x1x2_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x1x1x2_1x32x1x2_tb1x1x32x1_1x32x1x2_gkgs.s index 69952eef69..2e12ddc2b7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x1x1x2_1x32x1x2_tb1x1x32x1_1x32x1x2_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x1x1x2_1x32x1x2_tb1x1x32x1_1x32x1x2_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 81 -.set s_sub_n, 81 -.set s_k_padded, 82 +.set s_in_offset, 52 +.set s_out_offset, 82 +.set s_sub_n, 82 +.set s_k_padded, 83 .set s_tmp, 84 .set s_end, 90 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x1x1x2_1x32x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x32x1, cluster(n0,n1b,c0,c1e): 1x32x1x2 v_mov_b32 v[v_tmp], v0 @@ -725,8 +726,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x1x1x2_1x32x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -819,8 +820,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt4x64x32_wt4x64x4_ws1x1_wr1x1_ta1x1x1x2_1x32 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x8x1_1x16x1x16.s index d9a6300c4f..ad472e0619 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x8x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 59 -.set s_k_padded, 60 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 60 +.set s_k_padded, 61 .set s_tmp, 62 .set s_end, 68 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x1x1x4_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -679,8 +680,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x1x1x4_1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -752,8 +753,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x1x1x4_1x s_mov_b64 exec, -1 v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+4:a_c+7] ; repeat:0x0, step:1x0, num_a_c:4 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s index ca136765af..69edf98314 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x8x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 59 -.set s_k_padded, 60 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 60 +.set s_k_padded, 61 .set s_tmp, 62 .set s_end, 68 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x1x1x4_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -681,8 +682,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x1x1x4_1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -754,8 +755,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x1x1x4_1x s_mov_b64 exec, -1 v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+4:a_c+7] ; repeat:0x0, step:1x0, num_a_c:4 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x1x1x8_1x32x1x8_tb1x1x16x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x1x1x8_1x32x1x8_tb1x1x16x1_1x32x1x8.s index e082d7c96f..b6388cb0a4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x1x1x8_1x32x1x8_tb1x1x16x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x1x1x8_1x32x1x8_tb1x1x16x1_1x32x1x8.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 65 -.set s_sub_n, 71 -.set s_k_padded, 72 +.set s_in_offset, 52 +.set s_out_offset, 66 +.set s_sub_n, 72 +.set s_k_padded, 73 .set s_tmp, 74 .set s_end, 80 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x1x1x8_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x16x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -727,8 +728,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x1x1x8_1x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -818,8 +819,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x1x1x8_ s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] v_mfma_f32_16x16x16f16 a[a_c+28:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+28:a_c+31] ; repeat:1x1, step:0x1, num_a_c:4 - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x1x1x8_1x32x1x8_tb1x1x16x1_1x32x1x8_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x1x1x8_1x32x1x8_tb1x1x16x1_1x32x1x8_gkgs.s index 213c4607d1..ce1ed037f5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x1x1x8_1x32x1x8_tb1x1x16x1_1x32x1x8_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x1x1x8_1x32x1x8_tb1x1x16x1_1x32x1x8_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 65 -.set s_sub_n, 71 -.set s_k_padded, 72 +.set s_in_offset, 52 +.set s_out_offset, 66 +.set s_sub_n, 72 +.set s_k_padded, 73 .set s_tmp, 74 .set s_end, 80 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x1x1x8_1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x16x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -729,8 +730,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x1x1x8_1x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -820,8 +821,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x128x32_wt16x16x16_ws1x2_wr2x2_ta1x1x1x8_ s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] v_mfma_f32_16x16x16f16 a[a_c+28:a_c+31], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+28:a_c+31] ; repeat:1x1, step:0x1, num_a_c:4 - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x1x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x1x1_1x16x1x16.s index 09e4fb69cd..d10013c611 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x1x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x1x1_1x16x1x16.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_dim_b, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_dim_b, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 52 -.set s_k_padded, 53 -.set s_tmp, 54 -.set s_end, 60 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 53 +.set s_k_padded, 54 +.set s_tmp, 56 +.set s_end, 62 .set v_c, 0 ; coalescing:4, needed:0, resuable:25 .set v_a, 0 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x1x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -653,8 +654,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -719,8 +720,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -845,7 +846,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x1 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 64 - .amdhsa_next_free_sgpr 60 + .amdhsa_next_free_sgpr 62 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -856,7 +857,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x1x1_1x16x1x16 .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x1x1_1x16x1x16.kd - .sgpr_count: 66 + .sgpr_count: 68 .vgpr_count: 64 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.s index 5605974ee2..2476c8c265 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_dim_b, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_dim_b, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 52 -.set s_k_padded, 53 -.set s_tmp, 54 -.set s_end, 60 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 53 +.set s_k_padded, 54 +.set s_tmp, 56 +.set s_end, 62 .set v_c, 0 ; coalescing:4, needed:0, resuable:25 .set v_a, 0 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x1x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -655,8 +656,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -721,8 +722,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -841,7 +842,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x1 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 64 - .amdhsa_next_free_sgpr 60 + .amdhsa_next_free_sgpr 62 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -852,7 +853,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x1x1_1x16x1x16_gkgs.kd - .sgpr_count: 66 + .sgpr_count: 68 .vgpr_count: 64 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x32_wt64x4x4_ws1x1_wr1x1_ta1x1x1x8_1x32x1x8_tb1x1x2x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x32_wt64x4x4_ws1x1_wr1x1_ta1x1x1x8_1x32x1x8_tb1x1x2x1_1x32x1x8.s index f8645ec9b7..c295919d4c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x32_wt64x4x4_ws1x1_wr1x1_ta1x1x1x8_1x32x1x8_tb1x1x2x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x32_wt64x4x4_ws1x1_wr1x1_ta1x1x1x8_1x32x1x8_tb1x1x2x1_1x32x1x8.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 51 -.set s_sub_n, 57 -.set s_k_padded, 58 +.set s_in_offset, 52 +.set s_out_offset, 52 +.set s_sub_n, 58 +.set s_k_padded, 59 .set s_tmp, 60 .set s_end, 66 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x32_wt64x4x4_ws1x1_wr1x1_ta1x1x1x8_1x32x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x2x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -662,8 +663,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x32_wt64x4x4_ws1x1_wr1x1_ta1x1x1x8_1x32x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -732,8 +733,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x32_wt64x4x4_ws1x1_wr1x1_ta1x1x1x8_1x3 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x32_wt64x4x4_ws1x1_wr1x1_ta1x1x1x8_1x32x1x8_tb1x1x2x1_1x32x1x8_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x32_wt64x4x4_ws1x1_wr1x1_ta1x1x1x8_1x32x1x8_tb1x1x2x1_1x32x1x8_gkgs.s index 7c4f7394af..65385fac1f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x32_wt64x4x4_ws1x1_wr1x1_ta1x1x1x8_1x32x1x8_tb1x1x2x1_1x32x1x8_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x32_wt64x4x4_ws1x1_wr1x1_ta1x1x1x8_1x32x1x8_tb1x1x2x1_1x32x1x8_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 51 -.set s_sub_n, 57 -.set s_k_padded, 58 +.set s_in_offset, 52 +.set s_out_offset, 52 +.set s_sub_n, 58 +.set s_k_padded, 59 .set s_tmp, 60 .set s_end, 66 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x32_wt64x4x4_ws1x1_wr1x1_ta1x1x1x8_1x32x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x2x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -664,8 +665,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x32_wt64x4x4_ws1x1_wr1x1_ta1x1x1x8_1x32x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -734,8 +735,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x16x32_wt64x4x4_ws1x1_wr1x1_ta1x1x1x8_1x3 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x16x1_1x16x1x16.s index 3e899d44a5..783934029e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x16x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 65 -.set s_sub_n, 67 -.set s_k_padded, 68 +.set s_in_offset, 52 +.set s_out_offset, 66 +.set s_sub_n, 68 +.set s_k_padded, 69 .set s_tmp, 70 .set s_end, 76 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x4_1x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x16x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -702,8 +703,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x4_1x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -784,8 +785,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x4_1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s index 8e0feecd48..cb6da8fb71 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x4_1x16x1x16_tb1x1x16x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 65 -.set s_sub_n, 67 -.set s_k_padded, 68 +.set s_in_offset, 52 +.set s_out_offset, 66 +.set s_sub_n, 68 +.set s_k_padded, 69 .set s_tmp, 70 .set s_end, 76 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x4_1x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x16x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -704,8 +705,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x4_1x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -786,8 +787,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x1x1x4_1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x32x1x8_tb1x1x32x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x32x1x8_tb1x1x32x1_1x32x1x8.s index 4de8c6e7d0..2ac8388770 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x32x1x8_tb1x1x32x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x32x1x8_tb1x1x32x1_1x32x1x8.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 81 -.set s_sub_n, 87 -.set s_k_padded, 88 +.set s_in_offset, 52 +.set s_out_offset, 82 +.set s_sub_n, 88 +.set s_k_padded, 89 .set s_tmp, 90 .set s_end, 96 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x3 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x32x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -782,8 +783,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x3 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -892,8 +893,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x32x1x8_tb1x1x32x1_1x32x1x8_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x32x1x8_tb1x1x32x1_1x32x1x8_gkgs.s index a03f3f8057..a2a8724e6d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x32x1x8_tb1x1x32x1_1x32x1x8_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x32x1x8_tb1x1x32x1_1x32x1x8_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 81 -.set s_sub_n, 87 -.set s_k_padded, 88 +.set s_in_offset, 52 +.set s_out_offset, 82 +.set s_sub_n, 88 +.set s_k_padded, 89 .set s_tmp, 90 .set s_end, 96 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x3 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x32x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -784,8 +785,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1x3 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -894,8 +895,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x1x1x8_1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x2x1_1x16x1x16.s index 4b7927a2ee..11a7e736ad 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x2x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 51 -.set s_sub_n, 53 -.set s_k_padded, 54 +.set s_in_offset, 52 +.set s_out_offset, 52 +.set s_sub_n, 54 +.set s_k_padded, 55 .set s_tmp, 56 .set s_end, 62 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x1x1x4_1x16x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x2x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -662,8 +663,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x1x1x4_1x16x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -728,8 +729,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x1x1x4_1x1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s index c006ad5f75..63945830d2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x2x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 51 -.set s_sub_n, 53 -.set s_k_padded, 54 +.set s_in_offset, 52 +.set s_out_offset, 52 +.set s_sub_n, 54 +.set s_k_padded, 55 .set s_tmp, 56 .set s_end, 62 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x1x1x4_1x16x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x2x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -664,8 +665,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x1x1x4_1x16x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -730,8 +731,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x1x1x4_1x1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x32x32_wt32x8x4_ws1x2_wr1x1_ta1x1x1x8_1x32x1x8_tb1x1x4x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x32x32_wt32x8x4_ws1x2_wr1x1_ta1x1x1x8_1x32x1x8_tb1x1x4x1_1x32x1x8.s index c06b169134..7ef06d4bc1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x32x32_wt32x8x4_ws1x2_wr1x1_ta1x1x1x8_1x32x1x8_tb1x1x4x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x32x32_wt32x8x4_ws1x2_wr1x1_ta1x1x1x8_1x32x1x8_tb1x1x4x1_1x32x1x8.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 53 -.set s_sub_n, 59 -.set s_k_padded, 60 +.set s_in_offset, 52 +.set s_out_offset, 54 +.set s_sub_n, 60 +.set s_k_padded, 61 .set s_tmp, 62 .set s_end, 68 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x32x32_wt32x8x4_ws1x2_wr1x1_ta1x1x1x8_1x32x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x4x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -679,8 +680,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x32x32_wt32x8x4_ws1x2_wr1x1_ta1x1x1x8_1x32x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -752,8 +753,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x32x32_wt32x8x4_ws1x2_wr1x1_ta1x1x1x8_1x3 s_mov_b64 exec, -1 v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x0, step:0x1, num_a_c:4 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x32x32_wt32x8x4_ws1x2_wr1x1_ta1x1x1x8_1x32x1x8_tb1x1x4x1_1x32x1x8_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x32x32_wt32x8x4_ws1x2_wr1x1_ta1x1x1x8_1x32x1x8_tb1x1x4x1_1x32x1x8_gkgs.s index 7dede936c6..d2d646b442 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x32x32_wt32x8x4_ws1x2_wr1x1_ta1x1x1x8_1x32x1x8_tb1x1x4x1_1x32x1x8_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x32x32_wt32x8x4_ws1x2_wr1x1_ta1x1x1x8_1x32x1x8_tb1x1x4x1_1x32x1x8_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 53 -.set s_sub_n, 59 -.set s_k_padded, 60 +.set s_in_offset, 52 +.set s_out_offset, 54 +.set s_sub_n, 60 +.set s_k_padded, 61 .set s_tmp, 62 .set s_end, 68 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x32x32_wt32x8x4_ws1x2_wr1x1_ta1x1x1x8_1x32x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x4x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -681,8 +682,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x32x32_wt32x8x4_ws1x2_wr1x1_ta1x1x1x8_1x32x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -754,8 +755,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x32x32_wt32x8x4_ws1x2_wr1x1_ta1x1x1x8_1x3 s_mov_b64 exec, -1 v_mfma_f32_4x4x4f16 a[a_c+4:a_c+7], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+4:a_c+7] ; repeat:0x0, step:0x1, num_a_c:4 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16x1x4_tb1x1x1x1_1x16x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16x1x4_tb1x1x1x1_1x16x1x4.s index 68c5ae72c1..8883ca3329 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16x1x4_tb1x1x1x1_1x16x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16x1x4_tb1x1x1x1_1x16x1x4.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_dim_b, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_dim_b, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 64 -.set s_k_padded, 65 -.set s_tmp, 66 -.set s_end, 72 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 65 +.set s_k_padded, 66 +.set s_tmp, 68 +.set s_end, 74 .set v_c, 0 ; coalescing:4, needed:0, resuable:37 .set v_a, 0 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x1x1, cluster(n0,n1b,c0,c1e): 1x16x1x4 v_mov_b32 v[v_tmp], v0 @@ -683,8 +684,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -761,8 +762,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -899,7 +900,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x1 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 76 - .amdhsa_next_free_sgpr 72 + .amdhsa_next_free_sgpr 74 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -910,7 +911,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16x1x4_tb1x1x1x1_1x16x1x4 .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16x1x4_tb1x1x1x1_1x16x1x4.kd - .sgpr_count: 78 + .sgpr_count: 80 .vgpr_count: 76 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16x1x4_tb1x1x1x1_1x16x1x4_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16x1x4_tb1x1x1x1_1x16x1x4_gkgs.s index c77f3e297e..2a48075f4d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16x1x4_tb1x1x1x1_1x16x1x4_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16x1x4_tb1x1x1x1_1x16x1x4_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_dim_b, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_dim_b, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 64 -.set s_k_padded, 65 -.set s_tmp, 66 -.set s_end, 72 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 65 +.set s_k_padded, 66 +.set s_tmp, 68 +.set s_end, 74 .set v_c, 0 ; coalescing:4, needed:0, resuable:37 .set v_a, 0 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x1x1, cluster(n0,n1b,c0,c1e): 1x16x1x4 v_mov_b32 v[v_tmp], v0 @@ -685,8 +686,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -763,8 +764,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -895,7 +896,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x1 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 76 - .amdhsa_next_free_sgpr 72 + .amdhsa_next_free_sgpr 74 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -906,7 +907,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16x1x4_tb1x1x1x1_1x16x1x4_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x1x1x16_1x16x1x4_tb1x1x1x1_1x16x1x4_gkgs.kd - .sgpr_count: 78 + .sgpr_count: 80 .vgpr_count: 76 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x32_wt64x4x4_ws1x1_wr1x1_ta1x1x1x32_1x32x1x2_tb1x1x2x1_1x32x1x2.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x32_wt64x4x4_ws1x1_wr1x1_ta1x1x1x32_1x32x1x2_tb1x1x2x1_1x32x1x2.s index f8cf03d498..eaf04a9b02 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x32_wt64x4x4_ws1x1_wr1x1_ta1x1x1x32_1x32x1x2_tb1x1x2x1_1x32x1x2.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x32_wt64x4x4_ws1x1_wr1x1_ta1x1x1x32_1x32x1x2_tb1x1x2x1_1x32x1x2.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 51 -.set s_sub_n, 81 -.set s_k_padded, 82 +.set s_in_offset, 52 +.set s_out_offset, 52 +.set s_sub_n, 82 +.set s_k_padded, 83 .set s_tmp, 84 .set s_end, 90 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x32_wt64x4x4_ws1x1_wr1x1_ta1x1x1x32_1x32x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x2x1, cluster(n0,n1b,c0,c1e): 1x32x1x2 v_mov_b32 v[v_tmp], v0 @@ -728,8 +729,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x32_wt64x4x4_ws1x1_wr1x1_ta1x1x1x32_1x32x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -822,8 +823,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x32_wt64x4x4_ws1x1_wr1x1_ta1x1x1x32_1x3 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x32_wt64x4x4_ws1x1_wr1x1_ta1x1x1x32_1x32x1x2_tb1x1x2x1_1x32x1x2_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x32_wt64x4x4_ws1x1_wr1x1_ta1x1x1x32_1x32x1x2_tb1x1x2x1_1x32x1x2_gkgs.s index f19decf32f..6ff3cdf841 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x32_wt64x4x4_ws1x1_wr1x1_ta1x1x1x32_1x32x1x2_tb1x1x2x1_1x32x1x2_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x32_wt64x4x4_ws1x1_wr1x1_ta1x1x1x32_1x32x1x2_tb1x1x2x1_1x32x1x2_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 51 -.set s_sub_n, 81 -.set s_k_padded, 82 +.set s_in_offset, 52 +.set s_out_offset, 52 +.set s_sub_n, 82 +.set s_k_padded, 83 .set s_tmp, 84 .set s_end, 90 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x32_wt64x4x4_ws1x1_wr1x1_ta1x1x1x32_1x32x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x2x1, cluster(n0,n1b,c0,c1e): 1x32x1x2 v_mov_b32 v[v_tmp], v0 @@ -730,8 +731,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x32_wt64x4x4_ws1x1_wr1x1_ta1x1x1x32_1x32x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -824,8 +825,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x4x32_wt64x4x4_ws1x1_wr1x1_ta1x1x1x32_1x3 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x4x1_1x16x1x16.s index ffafd3d75f..0d46f1e84a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x4x1_1x16x1x16.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 53 -.set s_sub_n, 55 -.set s_k_padded, 56 +.set s_in_offset, 52 +.set s_out_offset, 54 +.set s_sub_n, 56 +.set s_k_padded, 57 .set s_tmp, 58 .set s_end, 64 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x1x1x4_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x4x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -662,8 +663,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x1x1x4_1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -730,8 +731,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x1x1x4_1x v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s index 8045fc0d7a..6ba29a9027 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x1x1x4_1x16x1x16_tb1x1x4x1_1x16x1x16_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 53 -.set s_sub_n, 55 -.set s_k_padded, 56 +.set s_in_offset, 52 +.set s_out_offset, 54 +.set s_sub_n, 56 +.set s_k_padded, 57 .set s_tmp, 58 .set s_end, 64 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x1x1x4_1x16 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x4x1, cluster(n0,n1b,c0,c1e): 1x16x1x16 v_mov_b32 v[v_tmp], v0 @@ -664,8 +665,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x1x1x4_1x16 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -732,8 +733,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x64x16_wt32x32x8_ws1x1_wr1x1_ta1x1x1x4_1x v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x8_1x32x1x8_tb1x1x8x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x8_1x32x1x8_tb1x1x8x1_1x32x1x8.s index 6e7a0c5b3f..502c3dae72 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x8_1x32x1x8_tb1x1x8x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x8_1x32x1x8_tb1x1x8x1_1x32x1x8.s @@ -238,32 +238,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 63 -.set s_k_padded, 64 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 64 +.set s_k_padded, 65 .set s_tmp, 66 .set s_end, 72 @@ -334,7 +335,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x8_1x3 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -683,8 +684,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x8_1x3 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -759,8 +760,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x8_1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x8_1x32x1x8_tb1x1x8x1_1x32x1x8_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x8_1x32x1x8_tb1x1x8x1_1x32x1x8_gkgs.s index 5b65e3a2ec..6b8b219806 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x8_1x32x1x8_tb1x1x8x1_1x32x1x8_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x8_1x32x1x8_tb1x1x8x1_1x32x1x8_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 57 -.set s_sub_n, 63 -.set s_k_padded, 64 +.set s_in_offset, 52 +.set s_out_offset, 58 +.set s_sub_n, 64 +.set s_k_padded, 65 .set s_tmp, 66 .set s_end, 72 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x8_1x3 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x8x1, cluster(n0,n1b,c0,c1e): 1x32x1x8 v_mov_b32 v[v_tmp], v0 @@ -685,8 +686,8 @@ igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x8_1x3 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -761,8 +762,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x1x1x8_1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s index 8e4496363f..35693fb85d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s @@ -238,33 +238,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_dim_b, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_dim_b, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 50 -.set s_k_padded, 51 -.set s_tmp, 52 -.set s_end, 58 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 51 +.set s_k_padded, 52 +.set s_tmp, 54 +.set s_end, 60 .set v_c, 0 ; coalescing:4, needed:0, resuable:22 .set v_a, 0 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x1x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -665,8 +666,8 @@ igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -729,8 +730,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -854,7 +855,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 62 - .amdhsa_next_free_sgpr 58 + .amdhsa_next_free_sgpr 60 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -865,7 +866,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32 .symbol: igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32.kd - .sgpr_count: 64 + .sgpr_count: 66 .vgpr_count: 62 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32_gkgs.s index b7a68b40a8..3c2a752064 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_dim_b, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_dim_b, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 50 -.set s_k_padded, 51 -.set s_tmp, 52 -.set s_end, 58 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 51 +.set s_k_padded, 52 +.set s_tmp, 54 +.set s_end, 60 .set v_c, 0 ; coalescing:4, needed:0, resuable:22 .set v_a, 0 @@ -336,7 +337,7 @@ igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x1x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -665,8 +666,8 @@ igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -727,8 +728,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -848,7 +849,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 62 - .amdhsa_next_free_sgpr 58 + .amdhsa_next_free_sgpr 60 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -859,7 +860,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32_gkgs.kd - .sgpr_count: 64 + .sgpr_count: 66 .vgpr_count: 62 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x2x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x2x1_1x8x1x32_gkgs.s index e05d78a77c..3b6f24d9db 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x2x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x2x1_1x8x1x32_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c0, 35 -.set s_in_stride_c, 36 -.set s_in_stride_n, 37 -.set s_wei_stride_c, 38 -.set s_wei_stride_k, 39 -.set s_out_stride_n_n1, 40 -.set s_in_stride_n_n1, 41 -.set s_move_slice_n_n1, 42 -.set s_move_slice_n_dsho, 43 -.set s_move_slice_n_dswo, 44 -.set s_dim_b, 45 -.set s_block_gtc_ik, 46 -.set s_block_gtc_ic0, 47 -.set s_block_gtc_ic1e, 48 -.set s_block_gtc_in, 49 -.set s_block_gtc_ig, 50 +.set s_out_stride_n, 35 +.set s_in_stride_c0, 36 +.set s_in_stride_c, 37 +.set s_in_stride_n, 38 +.set s_wei_stride_c, 39 +.set s_wei_stride_k, 40 +.set s_out_stride_n_n1, 41 +.set s_in_stride_n_n1, 42 +.set s_move_slice_n_n1, 43 +.set s_move_slice_n_dsho, 44 +.set s_move_slice_n_dswo, 45 +.set s_dim_b, 46 +.set s_block_gtc_ik, 47 +.set s_block_gtc_ic0, 48 +.set s_block_gtc_ic1e, 49 +.set s_block_gtc_in, 50 +.set s_block_gtc_ig, 51 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 51 -.set s_out_offset, 51 -.set s_sub_n, 51 -.set s_k_padded, 52 +.set s_in_offset, 52 +.set s_out_offset, 52 +.set s_sub_n, 52 +.set s_k_padded, 53 .set s_tmp, 54 .set s_end, 60 @@ -336,7 +337,7 @@ igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x2x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -664,8 +665,8 @@ igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -728,8 +729,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x1x2_1x8x1x32_tb1x1x1x1_1x8x1x32_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x1x2_1x8x1x32_tb1x1x1x1_1x8x1x32_gkgs.s index 85bfeb588e..ea5832d09a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x1x2_1x8x1x32_tb1x1x1x1_1x8x1x32_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x1x2_1x8x1x32_tb1x1x1x1_1x8x1x32_gkgs.s @@ -239,33 +239,34 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_dim_b, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_dim_b, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 50 -.set s_sub_n, 50 -.set s_k_padded, 51 -.set s_tmp, 52 -.set s_end, 58 +.set s_in_offset, 51 +.set s_out_offset, 51 +.set s_sub_n, 51 +.set s_k_padded, 52 +.set s_tmp, 54 +.set s_end, 60 .set v_c, 0 ; coalescing:8, needed:0, resuable:27 .set v_a, 0 @@ -335,7 +336,7 @@ igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x1x2_1x8x1x s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x1x1x1, cluster(n0,n1b,c0,c1e): 1x8x1x32 v_mov_b32 v[v_tmp], v0 @@ -660,8 +661,8 @@ igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x1x2_1x8x1x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -722,8 +723,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x1x2_1x8x s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -845,7 +846,7 @@ L_igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x1x2_1x8x .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 66 - .amdhsa_next_free_sgpr 58 + .amdhsa_next_free_sgpr 60 .amdhsa_ieee_mode 0 .amdhsa_dx10_clamp 0 .end_amdhsa_kernel @@ -856,7 +857,7 @@ amdhsa.version: [ 1, 0 ] amdhsa.kernels: - .name: igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x1x2_1x8x1x32_tb1x1x1x1_1x8x1x32_gkgs .symbol: igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x1x2_1x8x1x32_tb1x1x1x1_1x8x1x32_gkgs.kd - .sgpr_count: 64 + .sgpr_count: 66 .vgpr_count: 66 .kernarg_segment_align: 8 .kernarg_segment_size: 96 diff --git a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s index d72013f32a..6e67d50aae 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_wrw_fp16/igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x8x1x1_1x4x1x64_tb1x8x1x1_1x4x1x64_gkgs.s @@ -239,32 +239,33 @@ .set s_x, 30 .set s_gemmk_split, 31 .set s_group, 32 -.set s_out_stride_k, 33 +.set s_ho_padded, 33 +.set s_out_stride_k, 34 .set s_hoxwo, 31 -.set s_out_stride_n, 34 -.set s_in_stride_c, 35 -.set s_in_stride_n, 36 -.set s_wei_stride_c, 37 -.set s_wei_stride_k, 38 -.set s_out_stride_n_n1, 39 -.set s_in_stride_n_n1, 40 -.set s_move_slice_n_n1, 41 -.set s_move_slice_n_dsho, 42 -.set s_move_slice_n_dswo, 43 -.set s_dim_b, 44 -.set s_block_gtc_ik, 45 -.set s_block_gtc_ic0, 46 -.set s_block_gtc_ic1e, 47 -.set s_block_gtc_in, 48 -.set s_block_gtc_ig, 49 +.set s_out_stride_n, 35 +.set s_in_stride_c, 36 +.set s_in_stride_n, 37 +.set s_wei_stride_c, 38 +.set s_wei_stride_k, 39 +.set s_out_stride_n_n1, 40 +.set s_in_stride_n_n1, 41 +.set s_move_slice_n_n1, 42 +.set s_move_slice_n_dsho, 43 +.set s_move_slice_n_dswo, 44 +.set s_dim_b, 45 +.set s_block_gtc_ik, 46 +.set s_block_gtc_ic0, 47 +.set s_block_gtc_ic1e, 48 +.set s_block_gtc_in, 49 +.set s_block_gtc_ig, 50 .set s_knum, 1 .set s_gemm_k_num_n1, 0 .set s_kitr, 3 -.set s_in_offset, 50 -.set s_out_offset, 56 -.set s_sub_n, 56 -.set s_in_stride_wo, 57 -.set s_k_padded, 58 +.set s_in_offset, 51 +.set s_out_offset, 57 +.set s_sub_n, 57 +.set s_in_stride_wo, 58 +.set s_k_padded, 59 .set s_tmp, 60 .set s_end, 66 @@ -337,7 +338,7 @@ igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x8x1x1_1x4 s_load_dwordx2 s[s_p_wei+0:s_p_wei+1], s[s_ka+0:s_ka+1], 0+k_p_wei s_load_dwordx2 s[s_p_out+0:s_p_out+1], s[s_ka+0:s_ka+1], 0+k_p_out s_load_dwordx16 s[s_hi+0:s_hi+15], s[s_ka+0:s_ka+1], 0+k_hi - s_load_dword s[s_group], s[s_ka+0:s_ka+1], 0+k_group + s_load_dwordx2 s[s_group+0:s_group+1], s[s_ka+0:s_ka+1], 0+k_group ; input, thread(n0,n1b,c0,c1e): 1x8x1x1, cluster(n0,n1b,c0,c1e): 1x4x1x64 v_mov_b32 v[v_tmp], v0 @@ -678,8 +679,8 @@ igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x8x1x1_1x4 s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] @@ -749,8 +750,8 @@ L_igemm_wrw_gtcx_nchw_fp16_bx8_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x8x1x1_1 v_add_u32 v[v_move_slice_n_idsho], 1, v[v_move_slice_n_idsho] s_mov_b64 exec, -1 v_add_u32 v[v_move_slice_n_idsho], s[s_move_slice_n_dsho], v[v_move_slice_n_idsho] - v_cmpx_le_u32 vcc, s[s_ho], v[v_move_slice_n_idsho] - v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho], v[v_move_slice_n_idsho] + v_cmpx_le_u32 vcc, s[s_ho_padded], v[v_move_slice_n_idsho] + v_subrev_u32 v[v_move_slice_n_idsho], s[s_ho_padded], v[v_move_slice_n_idsho] v_add_u32 v[v_move_slice_n_in1], 1, v[v_move_slice_n_in1] v_add_u32 v[v_in_os_base], s[s_in_stride_n], v[v_in_os_base] v_add_u32 v[v_out_os_base], s[s_out_stride_n], v[v_out_os_base] diff --git a/src/solver/conv_asm_implicit_gemm_wrw_gtc_dynamic_xdlops.cpp b/src/solver/conv_asm_implicit_gemm_wrw_gtc_dynamic_xdlops.cpp index f1aaf003a2..4577936b3a 100644 --- a/src/solver/conv_asm_implicit_gemm_wrw_gtc_dynamic_xdlops.cpp +++ b/src/solver/conv_asm_implicit_gemm_wrw_gtc_dynamic_xdlops.cpp @@ -445,7 +445,8 @@ static inline int if_gemm_k_global_split(const ConvolutionContext& ctx, inline std::vector ComputeDynamicIGemmWrwKernelArgs(const conv::ProblemDescription& conv_problem, - const int log2_gemm_k_global_splits) + const int log2_gemm_k_global_splits, + const int nxb) { int hi = conv_problem.GetOutHeight(); int wi = conv_problem.GetOutWidth(); @@ -464,6 +465,9 @@ ComputeDynamicIGemmWrwKernelArgs(const conv::ProblemDescription& conv_problem, int x = conv_problem.GetWeightsWidth(); int group = conv_problem.GetGroupCount(); + int dim_b = (ho * wo + nxb - 1) / nxb * nxb; + int ho_padded = integer_divide_ceil(dim_b, wo); + std::vector opArgs; opArgs.emplace_back(hi); opArgs.emplace_back(wi); @@ -482,6 +486,7 @@ ComputeDynamicIGemmWrwKernelArgs(const conv::ProblemDescription& conv_problem, opArgs.emplace_back(x); opArgs.emplace_back(log2_gemm_k_global_splits); opArgs.emplace_back(group); + opArgs.emplace_back(ho_padded); return opArgs; } @@ -844,6 +849,7 @@ ConvAsmImplicitGemmGTCDynamicWrwXdlops::GetSolution(const ConvolutionContext& ct int grid_size; int log2_gemm_k_global_splits; std::string kernel_name; + int nxb; std::tie(is_valid, kernel_index, block_size, grid_size, log2_gemm_k_global_splits) = FindImplicitGemmWrwGTCDynamicXdlopsKernel(ctx); @@ -852,6 +858,7 @@ ConvAsmImplicitGemmGTCDynamicWrwXdlops::GetSolution(const ConvolutionContext& ct MIOPEN_THROW("this kernel should not run with igemm dynamic!"); kernel_name = kernel_configs[kernel_index].GetKernelName(); + nxb = kernel_configs[kernel_index].nxb; // MIOPEN_LOG_I2(kernel_name << " with groups for reduction: " // << (1 << log2_gemm_k_global_splits)); @@ -892,7 +899,8 @@ ConvAsmImplicitGemmGTCDynamicWrwXdlops::GetSolution(const ConvolutionContext& ct const auto& conv_problem = ctx.conv_problem; const auto& lowp_quant = ctx.conv_problem.GetConv().lowp_quant; - auto opShapeArgs = ComputeDynamicIGemmWrwKernelArgs(conv_problem, log2_gemm_k_global_splits); + auto opShapeArgs = + ComputeDynamicIGemmWrwKernelArgs(conv_problem, log2_gemm_k_global_splits, nxb); if(conv_problem.IsFp32()) { diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 64d09d40ec..69ab4672c1 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -992,10 +992,8 @@ COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ ${MIO #regression test for issue 540 COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 4 32 79 141 --weights 64 32 5 10 --pads_strides_dilations 0 0 2 2 1 1 --disable-forward --disable-backward-data -# WORKAROUND_ISSUE_996 -# COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 400 256 7 7 --weights 1024 256 7 7 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data -# WORKAROUND_ISSUE_996 -# COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 400 256 1 1 --weights 1024 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data +COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 400 256 7 7 --weights 1024 256 7 7 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data +COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 400 256 1 1 --weights 1024 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data # WORKAROUND_ISSUE_995 # COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 1 3 32 32 --weights 1 3 11 11 --pads_strides_dilations 1 1 2 2 2 1 --disable-forward --disable-backward-data # WORKAROUND_ISSUE_995